/*
 * FileBackupAgent.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/TenantEntryCache.actor.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbclient/BlobRestoreCommon.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbrpc/simulator.h"
#include "flow/EncryptUtils.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "fmt/format.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BlobCipher.h"
#include "fdbclient/ClientBooleanParams.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GetEncryptCipherKeys.h"
#include "fdbclient/JsonBuilder.h"
#include "fdbclient/KeyBackedTypes.actor.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/PartitionedLogIterator.h"
#include "fdbclient/RestoreInterface.h"
#include "fdbclient/Status.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TaskBucket.h"
#include "fdbclient/Tenant.h"
#include "flow/network.h"
#include "flow/Trace.h"

#include <cinttypes>
#include <cstdint>
#include <ctime>
#include <climits>
#include "flow/IAsyncFile.h"
#include "flow/genericactors.actor.h"
#include "flow/Hash3.h"
#include "flow/xxhash.h"

#include <memory>
#include <numeric>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <algorithm>
#include <unordered_map>
#include <utility>

#include "flow/actorcompiler.h" // This must be the last #include.

Optional<std::string> fileBackupAgentProxy = Optional<std::string>();

#define SevFRTestInfo SevVerbose
// #define SevFRTestInfo SevInfo

static std::string boolToYesOrNo(bool val) {
	return val ? std::string("Yes") : std::string("No");
}

static std::string versionToString(Optional<Version> version) {
	if (version.present())
		return std::to_string(version.get());
	else
		return "N/A";
}

static std::string timeStampToString(Optional<int64_t> epochs) {
	if (!epochs.present())
		return "N/A";
	return BackupAgentBase::formatTime(epochs.get());
}

static Future<Optional<int64_t>> getTimestampFromVersion(Optional<Version> ver,
                                                         Reference<ReadYourWritesTransaction> tr) {
	if (!ver.present())
		return Optional<int64_t>();

	return timeKeeperEpochsFromVersion(ver.get(), tr);
}

// Time format:
//   <= 59 seconds   : "N second(s)"
//   <= 59.99 minutes: "N.NN minute(s)"
//   <= 23.99 hours  : "N.NN hour(s)"
//   otherwise       : "N.NN day(s)"
std::string secondsToTimeFormat(int64_t seconds) {
	if (seconds >= 86400)
		return format("%.2f day(s)", seconds / 86400.0);
	else if (seconds >= 3600)
		return format("%.2f hour(s)", seconds / 3600.0);
	else if (seconds >= 60)
		return format("%.2f minute(s)", seconds / 60.0);
	else
		return format("%lld second(s)", seconds);
}

const Key FileBackupAgent::keyLastRestorable = "last_restorable"_sr;

// For convenience
typedef FileBackupAgent::ERestoreState ERestoreState;

StringRef FileBackupAgent::restoreStateText(ERestoreState id) {
	switch (id) {
	case ERestoreState::UNINITIALIZED:
		return "uninitialized"_sr;
	case ERestoreState::QUEUED:
		return "queued"_sr;
	case ERestoreState::STARTING:
		return "starting"_sr;
	case ERestoreState::RUNNING:
		return "running"_sr;
	case ERestoreState::COMPLETED:
		return "completed"_sr;
	case ERestoreState::ABORTED:
		return "aborted"_sr;
	default:
		return "Unknown"_sr;
	}
}

Key FileBackupAgent::getPauseKey() {
	FileBackupAgent backupAgent;
	return backupAgent.taskBucket->getPauseKey();
}

ACTOR Future<std::vector<KeyBackedTag>> TagUidMap::getAll_impl(TagUidMap* tagsMap,
                                                               Reference<ReadYourWritesTransaction> tr,
                                                               Snapshot snapshot) {
	state Key prefix = tagsMap->prefix; // Copying it here as tagsMap lifetime is not tied to this actor
	TagMap::RangeResultType tagPairs = wait(tagsMap->getRange(tr, std::string(), {}, 1e6, snapshot));
	std::vector<KeyBackedTag> results;
	for (auto& p : tagPairs.results)
		results.push_back(KeyBackedTag(p.first, prefix));
	return results;
}

KeyBackedTag::KeyBackedTag(std::string tagName, StringRef tagMapPrefix)
  : KeyBackedProperty<UidAndAbortedFlagT>(TagUidMap(tagMapPrefix).getProperty(tagName)), tagName(tagName),
    tagMapPrefix(tagMapPrefix) {}

// Lists all backups and finds whether any partitioned backup is running.
ACTOR Future<bool> anyPartitionedBackupRunning(Reference<ReadYourWritesTransaction> tr) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	std::vector<KeyBackedTag> tags = wait(getAllBackupTags(tr));

	state std::vector<Future<Optional<UidAndAbortedFlagT>>> futures;
	for (const auto& tag : tags) {
		futures.push_back(tag.get(tr));
	}

	wait(waitForAll(futures));
	state int i = 0;
	for (i = 0; i < futures.size(); i++) {
		if (futures[i].get().present()) {
			state Optional<bool> partitionedLog;
			state EBackupState eState;
			BackupConfig config(futures[i].get().get().first);

			wait(store(eState, config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN)) &&
			     store(partitionedLog, config.partitionedLogEnabled().get(tr)));
			if (FileBackupAgent::isRunnable(eState) && partitionedLog.present() && partitionedLog.get()) {
				return true;
			}
		}
	}
	return false;
}

class RestoreConfig : public KeyBackedTaskConfig {
public:
	RestoreConfig(UID uid = UID()) : KeyBackedTaskConfig(fileRestorePrefixRange.begin, uid) {}
	RestoreConfig(Reference<Task> task) : KeyBackedTaskConfig(fileRestorePrefixRange.begin, task) {}

	KeyBackedProperty<ERestoreState> stateEnum() { return configSpace.pack(__FUNCTION__sr); }
	Future<StringRef> stateText(Reference<ReadYourWritesTransaction> tr) {
		return map(stateEnum().getD(tr),
		           [](ERestoreState s) -> StringRef { return FileBackupAgent::restoreStateText(s); });
	}
	KeyBackedProperty<Key> addPrefix() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<bool> transformPartitionedLog() { return configSpace.pack(__FUNCTION__sr); }
	// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
	KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
	// XXX: Superseded by restoreRangeSet; restoreRanges can be removed once it is safe.
	KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedSet<KeyRange> restoreRangeSet() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<Key> batchFuture() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<Version> beginVersion() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<Version> restoreVersion() { return configSpace.pack(__FUNCTION__sr); }
	KeyBackedProperty<Version> firstConsistentVersion() { return configSpace.pack(__FUNCTION__sr); }

	KeyBackedProperty<Reference<IBackupContainer>> sourceContainer() { return configSpace.pack(__FUNCTION__sr); }
	// Get the source container as a bare URL, without creating a container instance
	KeyBackedProperty<Value> sourceContainerURL() { return configSpace.pack("sourceContainer"_sr); }

	// Total bytes written by all log and range restore tasks.
	KeyBackedBinaryValue<int64_t> bytesWritten() { return configSpace.pack(__FUNCTION__sr); }
	// File blocks that have had tasks created for them by the Dispatch task
	KeyBackedBinaryValue<int64_t> filesBlocksDispatched() { return configSpace.pack(__FUNCTION__sr); }
	// File blocks whose tasks have finished
	KeyBackedBinaryValue<int64_t> fileBlocksFinished() { return configSpace.pack(__FUNCTION__sr); }
	// Total number of files in the fileMap
	KeyBackedBinaryValue<int64_t> fileCount() { return configSpace.pack(__FUNCTION__sr); }
	// Total number of file blocks in the fileMap
	KeyBackedBinaryValue<int64_t> fileBlockCount() { return configSpace.pack(__FUNCTION__sr); }

	// True for blob granule restore
	KeyBackedBinaryValue<bool> isBlobGranuleRestore() { return configSpace.pack(__FUNCTION__sr); }

	Future<std::vector<KeyRange>> getRestoreRangesOrDefault(Reference<ReadYourWritesTransaction> tr) {
		return getRestoreRangesOrDefault_impl(this, tr);
	}

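	// Reads restoreRangeSet in batches; if it turns out to be empty, falls back to
	// the legacy restoreRanges and restoreRange fields.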
	ACTOR static Future<std::vector<KeyRange>> getRestoreRangesOrDefault_impl(RestoreConfig* self,
	                                                                          Reference<ReadYourWritesTransaction> tr) {
		state std::vector<KeyRange> ranges;
		state int batchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_RANGES_READ_BATCH;
		state Optional<KeyRange> begin;
		state Arena arena;
		loop {
			KeyBackedSet<KeyRange>::RangeResultType rangeResult =
			    wait(self->restoreRangeSet().getRange(tr, begin, {}, batchSize));
			ranges.insert(ranges.end(), rangeResult.results.begin(), rangeResult.results.end());
			if (!rangeResult.more) {
				break;
			}
			ASSERT(!rangeResult.results.empty());
			begin = KeyRangeRef(KeyRef(arena, ranges.back().begin), keyAfter(ranges.back().end, arena));
		}

		// fall back to original fields if the new field is empty
		if (ranges.empty()) {
			std::vector<KeyRange> _ranges = wait(self->restoreRanges().getD(tr));
			ranges = _ranges;
			if (ranges.empty()) {
				KeyRange range = wait(self->restoreRange().getD(tr));
				ranges.push_back(range);
			}
		}
		return ranges;
	}

	// Describes a file to load blocks from during restore.  Ordered by version and then fileName to enable
	// incrementally advancing through the map, saving the version and path of the next starting point.
	struct RestoreFile {
		Version version; // this is beginVersion, not endVersion
		std::string fileName;
		bool isRange{ false }; // false for log file
		int64_t blockSize{ 0 };
		int64_t fileSize{ 0 };
		Version endVersion{ ::invalidVersion }; // not meaningful for range files
		int64_t tagId = -1; // only meaningful for log files: the log router tag. Non-negative for the new backup format.
		int64_t totalTags = -1; // only meaningful for log files: the total number of log router tags.

		Tuple pack() const {
			return Tuple::makeTuple(
			    version, fileName, (int64_t)isRange, fileSize, blockSize, endVersion, tagId, totalTags);
		}
		static RestoreFile unpack(Tuple const& t) {
			RestoreFile r;
			int i = 0;
			r.version = t.getInt(i++);
			r.fileName = t.getString(i++).toString();
			r.isRange = t.getInt(i++) != 0;
			r.fileSize = t.getInt(i++);
			r.blockSize = t.getInt(i++);
			r.endVersion = t.getInt(i++);
			r.tagId = t.getInt(i++);
			r.totalTags = t.getInt(i++);
			return r;
		}
	};

	typedef KeyBackedSet<RestoreFile> FileSetT;
	FileSetT fileSet() { return configSpace.pack(__FUNCTION__sr); }

	FileSetT logFileSet() { return configSpace.pack(__FUNCTION__sr); }
	FileSetT rangeFileSet() { return configSpace.pack(__FUNCTION__sr); }

	Future<bool> isRunnable(Reference<ReadYourWritesTransaction> tr) {
		return map(stateEnum().getD(tr), [](ERestoreState s) -> bool {
			return s != ERestoreState::ABORTED && s != ERestoreState::COMPLETED && s != ERestoreState::UNINITIALIZED;
		});
	}

	Future<Void> logError(Database cx, Error e, std::string const& details, void* taskInstance = nullptr) {
		if (!uid.isValid()) {
			TraceEvent(SevError, "FileRestoreErrorNoUID").error(e).detail("Description", details);
			return Void();
		}
		TraceEvent t(SevWarn, "FileRestoreError");
		t.error(e)
		    .detail("RestoreUID", uid)
		    .detail("Description", details)
		    .detail("TaskInstance", (uint64_t)taskInstance);
		// key_not_found could happen
		if (e.code() == error_code_key_not_found)
			t.backtrace();

		return updateErrorInfo(cx, e, details);
	}

	Key mutationLogPrefix() { return uidPrefixKey(applyLogKeys.begin, uid); }

	Key applyMutationsMapPrefix() { return uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid); }

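	// Returns how far mutation application lags behind: the difference between the
	// applyMutations end version and begin version for this restore UID.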
	ACTOR static Future<int64_t> getApplyVersionLag_impl(Reference<ReadYourWritesTransaction> tr, UID uid) {
		state Future<Optional<Value>> beginVal =
		    tr->get(uidPrefixKey(applyMutationsBeginRange.begin, uid), Snapshot::True);
		state Future<Optional<Value>> endVal = tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid), Snapshot::True);
		wait(success(beginVal) && success(endVal));

		if (!beginVal.get().present() || !endVal.get().present())
			return 0;

		Version beginVersion = BinaryReader::fromStringRef<Version>(beginVal.get().get(), Unversioned());
		Version endVersion = BinaryReader::fromStringRef<Version>(endVal.get().get(), Unversioned());
		return endVersion - beginVersion;
	}

	Future<int64_t> getApplyVersionLag(Reference<ReadYourWritesTransaction> tr) {
		return getApplyVersionLag_impl(tr, uid);
	}

	void initApplyMutations(Reference<ReadYourWritesTransaction> tr,
	                        Key addPrefix,
	                        Key removePrefix,
	                        OnlyApplyMutationLogs onlyApplyMutationLogs) {
		// Set these because they have to match the applyMutations values.
		this->addPrefix().set(tr, addPrefix);
		this->removePrefix().set(tr, removePrefix);

		clearApplyMutationsKeys(tr);

		// Initialize add/remove prefix, range version map count and set the map's start key to InvalidVersion
		tr->set(uidPrefixKey(applyMutationsAddPrefixRange.begin, uid), addPrefix);
		tr->set(uidPrefixKey(applyMutationsRemovePrefixRange.begin, uid), removePrefix);

		int64_t startCount = 0;
		tr->set(uidPrefixKey(applyMutationsKeyVersionCountRange.begin, uid), StringRef((uint8_t*)&startCount, 8));
		Key mapStart = uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid);
		tr->set(mapStart, BinaryWriter::toValue<Version>(invalidVersion, Unversioned()));
	}

	void clearApplyMutationsKeys(Reference<ReadYourWritesTransaction> tr) {
		tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);

		// Clear add/remove prefix keys
		tr->clear(uidPrefixKey(applyMutationsAddPrefixRange.begin, uid));
		tr->clear(uidPrefixKey(applyMutationsRemovePrefixRange.begin, uid));

		// Clear range version map and count key
		tr->clear(uidPrefixKey(applyMutationsKeyVersionCountRange.begin, uid));
		Key mapStart = uidPrefixKey(applyMutationsKeyVersionMapRange.begin, uid);
		tr->clear(KeyRangeRef(mapStart, strinc(mapStart)));

		// Clear any loaded mutations that have not yet been applied
		Key mutationPrefix = mutationLogPrefix();
		tr->clear(KeyRangeRef(mutationPrefix, strinc(mutationPrefix)));

		// Clear end and begin versions (intentionally in this order)
		tr->clear(uidPrefixKey(applyMutationsEndRange.begin, uid));
		tr->clear(uidPrefixKey(applyMutationsBeginRange.begin, uid));
	}

	void setApplyBeginVersion(Reference<ReadYourWritesTransaction> tr, Version ver) {
		tr->set(uidPrefixKey(applyMutationsBeginRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
	}

	Future<Version> getApplyBeginVersion(Reference<ReadYourWritesTransaction> tr) {
		return map(tr->get(uidPrefixKey(applyMutationsBeginRange.begin, uid)),
		           [=](Optional<Value> const& value) -> Version {
			           return value.present() ? BinaryReader::fromStringRef<Version>(value.get(), Unversioned()) : 0;
		           });
	}

	void setApplyEndVersion(Reference<ReadYourWritesTransaction> tr, Version ver) {
		tr->set(uidPrefixKey(applyMutationsEndRange.begin, uid), BinaryWriter::toValue(ver, Unversioned()));
	}

	Future<Version> getApplyEndVersion(Reference<ReadYourWritesTransaction> tr) {
		return map(tr->get(uidPrefixKey(applyMutationsEndRange.begin, uid)),
		           [=](Optional<Value> const& value) -> Version {
			           return value.present() ? BinaryReader::fromStringRef<Version>(value.get(), Unversioned()) : 0;
		           });
	}

	ACTOR static Future<Version> getCurrentVersion_impl(RestoreConfig* self, Reference<ReadYourWritesTransaction> tr) {
		state ERestoreState status = wait(self->stateEnum().getD(tr));
		state Version version = -1;
		if (status == ERestoreState::RUNNING) {
			wait(store(version, self->getApplyBeginVersion(tr)));
		} else if (status == ERestoreState::COMPLETED) {
			wait(store(version, self->restoreVersion().getD(tr)));
		}
		return version;
	}

	Future<Version> getCurrentVersion(Reference<ReadYourWritesTransaction> tr) {
		return getCurrentVersion_impl(this, tr);
	}

	ACTOR static Future<std::string> getProgress_impl(RestoreConfig restore, Reference<ReadYourWritesTransaction> tr);
	Future<std::string> getProgress(Reference<ReadYourWritesTransaction> tr) { return getProgress_impl(*this, tr); }

	ACTOR static Future<std::string> getFullStatus_impl(RestoreConfig restore, Reference<ReadYourWritesTransaction> tr);
	Future<std::string> getFullStatus(Reference<ReadYourWritesTransaction> tr) { return getFullStatus_impl(*this, tr); }
};

typedef RestoreConfig::RestoreFile RestoreFile;

ACTOR Future<std::string> RestoreConfig::getProgress_impl(RestoreConfig restore,
                                                          Reference<ReadYourWritesTransaction> tr) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	state Future<int64_t> fileCount = restore.fileCount().getD(tr);
	state Future<int64_t> fileBlockCount = restore.fileBlockCount().getD(tr);
	state Future<int64_t> fileBlocksDispatched = restore.filesBlocksDispatched().getD(tr);
	state Future<int64_t> fileBlocksFinished = restore.fileBlocksFinished().getD(tr);
	state Future<int64_t> bytesWritten = restore.bytesWritten().getD(tr);
	state Future<StringRef> status = restore.stateText(tr);
	state Future<Version> currentVersion = restore.getCurrentVersion(tr);
	state Future<Version> lag = restore.getApplyVersionLag(tr);
	state Future<Version> firstConsistentVersion = restore.firstConsistentVersion().getD(tr);
	state Future<std::string> tag = restore.tag().getD(tr);
	state Future<std::pair<std::string, Version>> lastError = restore.lastError().getD(tr);

	// restore might no longer be valid after the first wait, so make sure it is not needed after this point.
	state UID uid = restore.getUid();
	wait(success(fileCount) && success(fileBlockCount) && success(fileBlocksDispatched) &&
	     success(fileBlocksFinished) && success(bytesWritten) && success(status) && success(currentVersion) &&
	     success(lag) && success(firstConsistentVersion) && success(tag) && success(lastError));

	std::string errstr = "None";
	if (lastError.get().second != 0)
		errstr = format("'%s' %" PRId64 "s ago.\n",
		                lastError.get().first.c_str(),
		                (tr->getReadVersion().get() - lastError.get().second) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND);

	TraceEvent("FileRestoreProgress")
	    .detail("RestoreUID", uid)
	    .detail("Tag", tag.get())
	    .detail("State", status.get().toString())
	    .detail("FileCount", fileCount.get())
	    .detail("FileBlocksFinished", fileBlocksFinished.get())
	    .detail("FileBlocksTotal", fileBlockCount.get())
	    .detail("FileBlocksInProgress", fileBlocksDispatched.get() - fileBlocksFinished.get())
	    .detail("BytesWritten", bytesWritten.get())
	    .detail("CurrentVersion", currentVersion.get())
	    .detail("FirstConsistentVersion", firstConsistentVersion.get())
	    .detail("ApplyLag", lag.get())
	    .detail("TaskInstance", THIS_ADDR);

	return format("Tag: %s  UID: %s  State: %s  Blocks: %lld/%lld  BlocksInProgress: %lld  Files: %lld  BytesWritten: "
	              "%lld  ApplyVersionLag: %lld  LastError: %s",
	              tag.get().c_str(),
	              uid.toString().c_str(),
	              status.get().toString().c_str(),
	              fileBlocksFinished.get(),
	              fileBlockCount.get(),
	              fileBlocksDispatched.get() - fileBlocksFinished.get(),
	              fileCount.get(),
	              bytesWritten.get(),
	              lag.get(),
	              errstr.c_str());
}

ACTOR Future<std::string> RestoreConfig::getFullStatus_impl(RestoreConfig restore,
                                                            Reference<ReadYourWritesTransaction> tr) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	state Future<std::vector<KeyRange>> ranges = restore.getRestoreRangesOrDefault(tr);
	state Future<Key> addPrefix = restore.addPrefix().getD(tr);
	state Future<Key> removePrefix = restore.removePrefix().getD(tr);
	state Future<Key> url = restore.sourceContainerURL().getD(tr);
	state Future<Version> restoreVersion = restore.restoreVersion().getD(tr);
	state Future<std::string> progress = restore.getProgress(tr);

	// restore might no longer be valid after the first wait, so make sure it is not needed after this point.
	wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) && success(restoreVersion) &&
	     success(progress));

	std::string returnStr;
	returnStr = format("%s  URL: %s", progress.get().c_str(), url.get().toString().c_str());
	for (auto& range : ranges.get()) {
		returnStr += format("  Range: '%s'-'%s'", printable(range.begin).c_str(), printable(range.end).c_str());
	}
	returnStr += format("  AddPrefix: '%s'  RemovePrefix: '%s'  Version: %lld",
	                    printable(addPrefix.get()).c_str(),
	                    printable(removePrefix.get()).c_str(),
	                    restoreVersion.get());
	return returnStr;
}

// Two buffers alternate between serving data and reading from file:
// while one buffer is serving data through peek(), the other is reading
// data from the file, providing pipelining.
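// Typical usage (see PartitionedLogIteratorTwoBuffers::getNext): call ready(), then
// peek()/getBufferSize() to consume the active buffer, then discardAndSwap() once the
// buffer is fully consumed; hasNext() reports whether any data remains.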
class TwoBuffers : public ReferenceCounted<TwoBuffers>, NonCopyable {
public:
	class IteratorBuffer : public ReferenceCounted<IteratorBuffer> {
	public:
		std::shared_ptr<char[]> data;
		// no value means no data is being fetched or ready;
		// a valid (but not ready) future means data is being fetched; a ready future means data is ready
		std::optional<Future<Void>> fetchingData;
		size_t size;
		int index;
		int capacity;
		IteratorBuffer(int _capacity) {
			capacity = _capacity;
			data = std::shared_ptr<char[]>(new char[capacity]());
			fetchingData.reset();
			size = 0;
		}
		bool is_valid() { return fetchingData.has_value(); }
		void reset() {
			size = 0;
			index = 0;
			fetchingData.reset();
		}
	};
	TwoBuffers(int capacity, Reference<IBackupContainer> _bc, std::vector<RestoreConfig::RestoreFile>& _files, int tag);
	// ready() needs to be called before calling peek(),
	// because a shared_ptr cannot be wrapped in a Future;
	// this method ensures the current buffer has available data
	Future<Void> ready();
	ACTOR static Future<Void> ready(Reference<TwoBuffers> self);
	// fill buffers[index] with the next block of the file
	// it has the side effect of advancing currentFileIndex and currentFilePosition
	ACTOR static Future<Void> readNextBlock(Reference<TwoBuffers> self, int index);
	// peek can only be called after ready has been called;
	// it returns a pointer to the active buffer
	std::shared_ptr<char[]> peek();

	int getFileIndex();
	void setFileIndex(int);

	bool hasNext();

	void reset();

	// discard the current buffer and swap to the next one
	void discardAndSwap();

	// try to fill buffers[index],
	// but no-op if the buffer already has valid data or is actively being filled
	void fillBufferIfAbsent(int index);

	size_t getBufferSize();

private:
	Reference<IteratorBuffer> buffers[2]; // Two buffers for alternating
	size_t bufferCapacity; // Size of each buffer in bytes
	Reference<IBackupContainer> bc;
	std::vector<RestoreConfig::RestoreFile> files;
	int tag;

	int cur; // Index of the current active buffer (0 or 1)
	size_t currentFileIndex; // Index of the current file being read
	size_t currentFilePosition; // Current read position in the current file
};

TwoBuffers::TwoBuffers(int capacity,
                       Reference<IBackupContainer> _bc,
                       std::vector<RestoreConfig::RestoreFile>& _files,
                       int _tag)
  : currentFileIndex(0), currentFilePosition(0), cur(0), bufferCapacity(capacity), files(_files), bc(_bc), tag(_tag) {
	buffers[0] = makeReference<IteratorBuffer>(capacity);
	buffers[1] = makeReference<IteratorBuffer>(capacity);
}

bool TwoBuffers::hasNext() {
	// Skip files that have been fully read. (A buffer that is still being loaded is valid but not ready, so its size is not yet known.)
	while (currentFileIndex < files.size() && currentFilePosition >= files[currentFileIndex].fileSize) {
		currentFileIndex++;
		currentFilePosition = 0;
	}

	if (buffers[0]->is_valid() || buffers[1]->is_valid()) {
		return true;
	}

	return currentFileIndex != files.size();
}

Future<Void> TwoBuffers::ready() {
	return ready(Reference<TwoBuffers>::addRef(this));
}

ACTOR Future<Void> TwoBuffers::ready(Reference<TwoBuffers> self) {
	// nothing to wait for if there is no more data
	if (!self->hasNext()) {
		return Void();
	}
	// try to fill the current buffer, and wait before it is filled
	self->fillBufferIfAbsent(self->cur);
	wait(self->buffers[self->cur]->fetchingData.value());
	// try to fill the next buffer, do not wait for the filling
	if (self->hasNext()) {
		self->fillBufferIfAbsent(1 - self->cur);
	}
	return Void();
}

std::shared_ptr<char[]> TwoBuffers::peek() {
	return buffers[cur]->data;
}

int TwoBuffers::getFileIndex() {
	return buffers[cur]->index;
}

void TwoBuffers::setFileIndex(int newIndex) {
	if (newIndex < 0 || newIndex >= files.size()) {
		TraceEvent(SevError, "TwoBuffersFileIndexOutOfBound")
		    .detail("FilesSize", files.size())
		    .detail("NewIndex", newIndex)
		    .log();
	}
	currentFileIndex = newIndex;
}

void TwoBuffers::discardAndSwap() {
	// invalidate cur and change cur to next
	buffers[cur]->fetchingData.reset();
	cur = 1 - cur;
}

void TwoBuffers::reset() {
	// reset both buffers and rewind to the first file
	buffers[0]->reset();
	buffers[1]->reset();
	cur = 0;
	currentFileIndex = 0;
	currentFilePosition = 0;
}

size_t TwoBuffers::getBufferSize() {
	return buffers[cur]->size;
}

static double testKeyToDouble(const KeyRef& p) {
	uint64_t x = 0;
	sscanf(p.toString().c_str(), "%" SCNx64, &x);
	return *(double*)&x;
}

// Only one readNextBlock may run at a time, otherwise the same block might be loaded twice
ACTOR Future<Void> TwoBuffers::readNextBlock(Reference<TwoBuffers> self, int index) {
	state Reference<IAsyncFile> asyncFile;
	if (self->currentFileIndex >= self->files.size()) {
		TraceEvent(SevError, "ReadNextBlockOutOfBound")
		    .detail("FileIndex", self->currentFileIndex)
		    .detail("Tag", self->tag)
		    .detail("Position", self->currentFilePosition)
		    .detail("FileSize", self->files[self->currentFileIndex].fileSize)
		    .detail("FilesCount", self->files.size())
		    .log();
		return Void();
	}
	Reference<IAsyncFile> asyncFileTmp = wait(self->bc->readFile(self->files[self->currentFileIndex].fileName));
	asyncFile = asyncFileTmp;
	state size_t fileSize = self->files[self->currentFileIndex].fileSize;
	size_t remaining = fileSize - self->currentFilePosition;
	state size_t bytesToRead = std::min(self->bufferCapacity, remaining);
	state int bytesRead = wait(
	    asyncFile->read(static_cast<void*>(self->buffers[index]->data.get()), bytesToRead, self->currentFilePosition));
	if (bytesRead != bytesToRead)
		throw restore_bad_read();
	self->buffers[index]->index = self->currentFileIndex;
	self->buffers[index]->size = bytesRead; // Set to actual bytes read
	self->currentFilePosition += bytesRead;

	return Void();
}

void TwoBuffers::fillBufferIfAbsent(int index) {
	if (buffers[index]->is_valid()) {
		// if this buffer is valid, then do not overwrite it
		return;
	}
	if (currentFileIndex == files.size()) {
		// quit if no more contents
		return;
	}
	auto self = Reference<TwoBuffers>::addRef(this);
	self->buffers[index]->fetchingData = readNextBlock(self, index);
	return;
}

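// Log blocks are padded to the block boundary with 0xff bytes, so a 0xff byte at
// start + offset marks the beginning of a block's trailing padding.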
bool endOfBlock(char* start, int offset) {
	const unsigned char paddingChar = '\xff';
	return (unsigned char)*(start + offset) == paddingChar;
}

class PartitionedLogIteratorSimple : public PartitionedLogIterator {
public:
	const int BATCH_READ_BLOCK_COUNT = 1;
	const int BLOCK_SIZE = CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE;
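	// Per-mutation header: 8-byte version + 4-byte subsequence + 4-byte mutation size, all big-endian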
	const int mutationHeaderBytes = sizeof(int64_t) + sizeof(int32_t) + sizeof(int32_t);
	Reference<IBackupContainer> bc;
	size_t bufferCapacity;
	int tag;
	std::vector<RestoreConfig::RestoreFile> files;
	size_t bufferOffset; // Current read offset
	int bufferSize;
	int fileOffset;
	int fileIndex;
	std::shared_ptr<char[]> buffer;
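	// endVersion of each file; used to detect and skip version ranges that overlap between adjacent log files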
	std::vector<Version> endVersions;

	PartitionedLogIteratorSimple(Reference<IBackupContainer> _bc,
	                             int _tag,
	                             std::vector<RestoreConfig::RestoreFile> _files,
	                             std::vector<Version> _endVersions);

	bool hasNext();
	Future<Void> loadNextBlock();
	ACTOR static Future<Void> loadNextBlock(Reference<PartitionedLogIteratorSimple> self);
	void removeBlockHeader();

	Standalone<VectorRef<VersionedMutation>> consumeData(Version firstVersion);

	// find the next version without advancing the iterator
	Future<Version> peekNextVersion();
	ACTOR static Future<Version> peekNextVersion(Reference<PartitionedLogIteratorSimple> iterator);

	// get all the mutations of the next version and advance the iterator
	// this might issue multiple consumeData() calls if the data of a version crosses a buffer boundary
	Future<Standalone<VectorRef<VersionedMutation>>> getNext();
	ACTOR static Future<Standalone<VectorRef<VersionedMutation>>> getNext(
	    Reference<PartitionedLogIteratorSimple> iterator);
};

PartitionedLogIteratorSimple::PartitionedLogIteratorSimple(Reference<IBackupContainer> _bc,
                                                           int _tag,
                                                           std::vector<RestoreConfig::RestoreFile> _files,
                                                           std::vector<Version> _endVersions)
  : bc(_bc), tag(_tag), endVersions(_endVersions), files(std::move(_files)), bufferOffset(0) {
	bufferCapacity = BATCH_READ_BLOCK_COUNT * BLOCK_SIZE;
	buffer = std::shared_ptr<char[]>(new char[bufferCapacity]());
	fileOffset = 0;
	fileIndex = 0;
	bufferSize = 0;
}

// Advances fileIndex and fileOffset past fully-read files; returns true if unread data remains
bool PartitionedLogIteratorSimple::hasNext() {
	if (bufferOffset < bufferSize) {
		return true;
	}
	while (fileIndex < files.size() && fileOffset >= files[fileIndex].fileSize) {
		TraceEvent("ReachEndOfLogFiles")
		    .detail("BufferOffset", bufferOffset)
		    .detail("BufferSize", bufferSize)
		    .detail("FileOffset", fileOffset)
		    .detail("FileSize", files[fileIndex].fileSize)
		    .detail("FileName", files[fileIndex].fileName)
		    .detail("Tag", tag)
		    .detail("Index", fileIndex)
		    .log();
		fileOffset = 0;
		fileIndex++;
	}
	return fileIndex < files.size() && fileOffset < files[fileIndex].fileSize;
}

void PartitionedLogIteratorSimple::removeBlockHeader() {
	if (bufferOffset % BLOCK_SIZE == 0) {
		bufferOffset += sizeof(uint32_t);
	}
}

Standalone<VectorRef<VersionedMutation>> PartitionedLogIteratorSimple::consumeData(Version firstVersion) {
	Standalone<VectorRef<VersionedMutation>> mutations = Standalone<VectorRef<VersionedMutation>>();
	char* start = buffer.get();
	bool foundNewVersion = false;
	while (bufferOffset < bufferSize) {
		while (bufferOffset < bufferSize && !endOfBlock(start, bufferOffset)) {
			// for each block
			removeBlockHeader();

			// encoding format:
			// wr << bigEndian64(message.version.version) << bigEndian32(message.version.sub) <<
			// bigEndian32(mutation.size());
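			// i.e. each record is: | version (8B, BE) | subsequence (4B, BE) | size (4B, BE) | mutation bytes (size B) |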
			Version version;
			std::memcpy(&version, start + bufferOffset, sizeof(Version));
			version = bigEndian64(version);
			if (version != firstVersion) {
				foundNewVersion = true;
				break; // Different version, stop here
			}

			int32_t subsequence;
			std::memcpy(&subsequence, start + bufferOffset + sizeof(Version), sizeof(int32_t));
			subsequence = bigEndian32(subsequence);

			int32_t mutationSize;
			std::memcpy(&mutationSize, start + bufferOffset + sizeof(Version) + sizeof(int32_t), sizeof(int32_t));
			mutationSize = bigEndian32(mutationSize);

			// assumption: the entire mutation is within the buffer
			size_t mutationTotalSize = mutationHeaderBytes + mutationSize;
			ASSERT(bufferOffset + mutationTotalSize <= bufferSize);

			// transform from stringref to mutationref here
			Standalone<StringRef> mutationData = makeString(mutationSize);
			std::memcpy(mutateString(mutationData), start + bufferOffset + mutationHeaderBytes, mutationSize);
			ArenaReader reader(mutationData.arena(), mutationData, AssumeVersion(g_network->protocolVersion()));
			MutationRef mutation;
			reader >> mutation;

			VersionedMutation vm;
			vm.version = version;
			vm.subsequence = subsequence;
			vm.mutation = mutation;
			mutations.push_back_deep(mutations.arena(), vm);
			// Move the bufferOffset to include this mutation
			bufferOffset += mutationTotalSize;
		}

		if (bufferOffset < bufferSize && endOfBlock(start, bufferOffset)) {
			// the remainder of this block is padding; skip it
			int remain = BLOCK_SIZE - (bufferOffset % BLOCK_SIZE);
			bufferOffset += remain;
		}
		if (foundNewVersion) {
			break;
		}
	}

	return mutations;
}

Future<Void> PartitionedLogIteratorSimple::loadNextBlock() {
	return loadNextBlock(Reference<PartitionedLogIteratorSimple>::addRef(this));
}

ACTOR Future<Void> PartitionedLogIteratorSimple::loadNextBlock(Reference<PartitionedLogIteratorSimple> self) {
	if (self->bufferOffset < self->bufferSize) {
		// do nothing
		return Void();
	}
	if (!self->hasNext()) {
		return Void();
	}
	state Reference<IAsyncFile> asyncFile;
	Reference<IAsyncFile> asyncFileTmp = wait(self->bc->readFile(self->files[self->fileIndex].fileName));
	asyncFile = asyncFileTmp;
	state size_t fileSize = self->files[self->fileIndex].fileSize;
	size_t remaining = fileSize - self->fileOffset;
	state size_t bytesToRead = std::min(self->bufferCapacity, remaining);
	state int bytesRead =
	    wait(asyncFile->read(static_cast<void*>((self->buffer.get())), bytesToRead, self->fileOffset));
	if (bytesRead != bytesToRead)
		throw restore_bad_read();
	self->bufferSize = bytesRead; // Set to actual bytes read
	self->bufferOffset = 0; // Reset bufferOffset for the new data
	self->fileOffset += bytesRead;
	return Void();
}

Future<Version> PartitionedLogIteratorSimple::peekNextVersion() {
	return peekNextVersion(Reference<PartitionedLogIteratorSimple>::addRef(this));
}

ACTOR Future<Version> PartitionedLogIteratorSimple::peekNextVersion(Reference<PartitionedLogIteratorSimple> self) {
	// Read the first mutation's version
	if (!self->hasNext()) {
		return Version(0);
	}
	wait(self->loadNextBlock());
	self->removeBlockHeader();
	state Version version;
	std::memcpy(&version, self->buffer.get() + self->bufferOffset, sizeof(Version));
	version = bigEndian64(version);

	while (self->fileIndex < self->endVersions.size() - 1 && version >= self->endVersions[self->fileIndex]) {
		TraceEvent("SimpleIteratorFindOverlapAndSkip")
		    .detail("Version", version)
		    .detail("FileIndex", self->fileIndex)
		    .log();
		self->bufferOffset = 0;
		self->bufferSize = 0;
		self->fileOffset = 0;
		self->fileIndex += 1;
		wait(self->loadNextBlock());
		self->removeBlockHeader();
		std::memcpy(&version, self->buffer.get() + self->bufferOffset, sizeof(Version));
		version = bigEndian64(version);
	}
	return version;
}

ACTOR Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorSimple::getNext(
    Reference<PartitionedLogIteratorSimple> self) {
	state Standalone<VectorRef<VersionedMutation>> mutations;
	if (!self->hasNext()) {
		TraceEvent(SevWarn, "SimpleIteratorExhausted")
		    .detail("BufferOffset", self->bufferOffset)
		    .detail("BufferSize", self->bufferSize)
		    .detail("Tag", self->tag)
		    .log();
		return mutations;
	}
	state Version firstVersion = wait(self->peekNextVersion());
	Standalone<VectorRef<VersionedMutation>> firstBatch = self->consumeData(firstVersion);
	mutations = firstBatch;
	// If the current buffer is fully consumed, then we need to check the next buffer in case
	// the version is sliced across this buffer boundary

	while (self->bufferOffset >= self->bufferSize) {
		// data for one version cannot exceed a single buffer size
		// when hitting the end of a batch, check the next batch in case the version's data is split across the boundary
		if (self->hasNext()) {
			// this runs for each block, though it is unnecessary for the last block of a file
			wait(self->loadNextBlock());
			Standalone<VectorRef<VersionedMutation>> batch = self->consumeData(firstVersion);
			for (const VersionedMutation& vm : batch) {
				mutations.push_back_deep(mutations.arena(), vm);
			}
		} else {
			break;
		}
	}
	return mutations;
}

Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorSimple::getNext() {
	return getNext(Reference<PartitionedLogIteratorSimple>::addRef(this));
}

class PartitionedLogIteratorTwoBuffers : public PartitionedLogIterator {
private:
	Reference<TwoBuffers> twobuffer;

	// consume data for a single version up to the end of the current batch,
	// stopping upon seeing a version different from the parameter.
	// it has the side effect of advancing bufferOffset past the data that was read
	Future<Standalone<VectorRef<VersionedMutation>>> consumeData(Version firstVersion);
	ACTOR static Future<Standalone<VectorRef<VersionedMutation>>> consumeData(
	    Reference<PartitionedLogIteratorTwoBuffers> self,
	    Version v);

	// each block has the format {<header>[mutations]<padding>}; the header must be skipped to read mutations.
	// this method checks whether bufferOffset is at a block boundary and advances it past the header if so
	void removeBlockHeader();

public:
	// read up to a fixed number of blocks at a time
	// note that each version's data has to be contained within 2 blocks
	const int BATCH_READ_BLOCK_COUNT = 1;
	const int BLOCK_SIZE = CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE;
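	// Per-mutation header: 8-byte version + 4-byte subsequence + 4-byte mutation size, all big-endian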
	const int mutationHeaderBytes = sizeof(int64_t) + sizeof(int32_t) + sizeof(int32_t);
	Reference<IBackupContainer> bc;
	int tag;
	std::vector<RestoreConfig::RestoreFile> files;
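	// endVersion of each file; used to detect and skip version ranges that overlap between adjacent log files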
	std::vector<Version> endVersions;
	bool hasMoreData; // Flag indicating if more data is available
	size_t bufferOffset; // Current read offset
	// empty means no data; a future that is valid but not ready means data is being fetched;
	// a ready future means the buffer currently holds data

	PartitionedLogIteratorTwoBuffers(Reference<IBackupContainer> _bc,
	                                 int _tag,
	                                 std::vector<RestoreConfig::RestoreFile> _files,
	                                 std::vector<Version> _endVersions);

	// whether there is more content for this tag across all of the specified files
	bool hasNext();

	// find the next version without advancing the iterator
	Future<Version> peekNextVersion();
	ACTOR static Future<Version> peekNextVersion(Reference<PartitionedLogIteratorTwoBuffers> iterator);

	// get all the mutations of the next version and advance the iterator
	// this might issue multiple consumeData() calls if the data of a version crosses a buffer boundary
	Future<Standalone<VectorRef<VersionedMutation>>> getNext();
	ACTOR static Future<Standalone<VectorRef<VersionedMutation>>> getNext(
	    Reference<PartitionedLogIteratorTwoBuffers> iterator);
};

Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorTwoBuffers::consumeData(Version firstVersion) {
	return consumeData(Reference<PartitionedLogIteratorTwoBuffers>::addRef(this), firstVersion);
}

ACTOR Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorTwoBuffers::consumeData(
    Reference<PartitionedLogIteratorTwoBuffers> self,
    Version firstVersion) {
	state Standalone<VectorRef<VersionedMutation>> mutations = Standalone<VectorRef<VersionedMutation>>();
	wait(self->twobuffer->ready());
	std::shared_ptr<char[]> start = self->twobuffer->peek();
	int size = self->twobuffer->getBufferSize();
	bool foundNewVersion = false;
	while (self->bufferOffset < size) {
		while (self->bufferOffset < size && !endOfBlock(start.get(), self->bufferOffset)) {
			// for each block
			self->removeBlockHeader();

			// encoding is:
			// wr << bigEndian64(message.version.version) << bigEndian32(message.version.sub) <<
			// bigEndian32(mutation.size());
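			// i.e. each record is: | version (8B, BE) | subsequence (4B, BE) | size (4B, BE) | mutation bytes (size B) |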
			Version version;
			std::memcpy(&version, start.get() + self->bufferOffset, sizeof(Version));
			version = bigEndian64(version);
			if (version != firstVersion) {
				foundNewVersion = true;
				break; // Different version, stop here
			}

			int32_t subsequence;
			std::memcpy(&subsequence, start.get() + self->bufferOffset + sizeof(Version), sizeof(int32_t));
			subsequence = bigEndian32(subsequence);

			int32_t mutationSize;
			std::memcpy(
			    &mutationSize, start.get() + self->bufferOffset + sizeof(Version) + sizeof(int32_t), sizeof(int32_t));
			mutationSize = bigEndian32(mutationSize);

			// assumption: the entire mutation is within the buffer
			size_t mutationTotalSize = self->mutationHeaderBytes + mutationSize;
			ASSERT(self->bufferOffset + mutationTotalSize <= size);

			Standalone<StringRef> mutationData = makeString(mutationSize);
			std::memcpy(
			    mutateString(mutationData), start.get() + self->bufferOffset + self->mutationHeaderBytes, mutationSize);
			// transform from stringref to mutationref here
			ArenaReader reader(mutationData.arena(), mutationData, AssumeVersion(g_network->protocolVersion()));
			MutationRef mutation;
			reader >> mutation;

			VersionedMutation vm;
			vm.version = version;
			vm.subsequence = subsequence;
			vm.mutation = mutation;
			mutations.push_back_deep(mutations.arena(), vm);
			// Move the bufferOffset to include this mutation
			self->bufferOffset += mutationTotalSize;
		}

		if (self->bufferOffset < size && endOfBlock(start.get(), self->bufferOffset)) {
			// the remainder of this block is padding; skip it
			int remain = self->BLOCK_SIZE - (self->bufferOffset % self->BLOCK_SIZE);
			self->bufferOffset += remain;
		}
		if (foundNewVersion) {
			break;
		}
	}
	return mutations;
}

void PartitionedLogIteratorTwoBuffers::removeBlockHeader() {
	if (bufferOffset % BLOCK_SIZE == 0) {
		bufferOffset += sizeof(uint32_t);
	}
}

PartitionedLogIteratorTwoBuffers::PartitionedLogIteratorTwoBuffers(Reference<IBackupContainer> _bc,
                                                                   int _tag,
                                                                   std::vector<RestoreConfig::RestoreFile> _files,
                                                                   std::vector<Version> _endVersions)
  : bc(_bc), tag(_tag), files(std::move(_files)), endVersions(_endVersions), bufferOffset(0) {
	int bufferCapacity = BATCH_READ_BLOCK_COUNT * BLOCK_SIZE;
	twobuffer = makeReference<TwoBuffers>(bufferCapacity, _bc, files, tag);
}

bool PartitionedLogIteratorTwoBuffers::hasNext() {
	return twobuffer->hasNext();
}

Future<Version> PartitionedLogIteratorTwoBuffers::peekNextVersion() {
	return peekNextVersion(Reference<PartitionedLogIteratorTwoBuffers>::addRef(this));
}
ACTOR Future<Version> PartitionedLogIteratorTwoBuffers::peekNextVersion(
    Reference<PartitionedLogIteratorTwoBuffers> self) {
	// Read the first mutation's version
	state std::shared_ptr<char[]> start;
	state Version version;
	state int fileIndex;
	if (!self->hasNext()) {
		return Version(0);
	}
	wait(self->twobuffer->ready());
	start = self->twobuffer->peek();
	self->removeBlockHeader();
	std::memcpy(&version, start.get() + self->bufferOffset, sizeof(Version));
	version = bigEndian64(version);
	fileIndex = self->twobuffer->getFileIndex();
	while (fileIndex < self->endVersions.size() - 1 && version >= self->endVersions[fileIndex]) {
		TraceEvent("RestoreLogFilesFoundOverlapAndSkip")
		    .detail("Version", version)
		    .detail("FileIndex", fileIndex)
		    .log();
		// need to read from the next file when version ranges overlap between log files
		self->twobuffer->reset();
		self->bufferOffset = 0;
		self->twobuffer->setFileIndex(fileIndex + 1);
		wait(self->twobuffer->ready());
		start = self->twobuffer->peek();
		self->removeBlockHeader();
		std::memcpy(&version, start.get() + self->bufferOffset, sizeof(Version));
		version = bigEndian64(version);
		fileIndex = self->twobuffer->getFileIndex();
	}
	return version;
}

ACTOR Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorTwoBuffers::getNext(
    Reference<PartitionedLogIteratorTwoBuffers> self) {
	state Standalone<VectorRef<VersionedMutation>> mutations;
	if (!self->hasNext()) {
		TraceEvent(SevWarn, "IteratorExhausted").log();
		return mutations;
	}
	state Version firstVersion = wait(self->peekNextVersion());

	Standalone<VectorRef<VersionedMutation>> firstBatch = wait(self->consumeData(firstVersion));
	mutations = firstBatch;
	// If the current buffer is fully consumed, then we need to check the next buffer in case
	// the version is sliced across this buffer boundary
	while (self->bufferOffset >= self->twobuffer->getBufferSize()) {
		self->twobuffer->discardAndSwap();
		self->bufferOffset = 0;
		// data for one version cannot exceed a single buffer size
		// when hitting the end of a batch, check the next batch in case the version's data is split across the boundary
		if (self->twobuffer->hasNext()) {
			// this runs for each block, though it is unnecessary for the last block of a file;
			// we cannot check hasMoreData here because the other buffer might hold the last piece
			Version nextVersion = wait(self->peekNextVersion());
			if (nextVersion != firstVersion) {
				break;
			}
			Standalone<VectorRef<VersionedMutation>> batch = wait(self->consumeData(firstVersion));
			for (const VersionedMutation& vm : batch) {
				mutations.push_back_deep(mutations.arena(), vm);
			}
		} else {
			break;
		}
	}
	return mutations;
}

Future<Standalone<VectorRef<VersionedMutation>>> PartitionedLogIteratorTwoBuffers::getNext() {
	return getNext(Reference<PartitionedLogIteratorTwoBuffers>::addRef(this));
}

FileBackupAgent::FileBackupAgent()
  : subspace(Subspace(fileBackupPrefixRange.begin))
    // The other subspaces have logUID -> value
    ,
    config(subspace.get(BackupAgentBase::keyConfig)), lastRestorable(subspace.get(FileBackupAgent::keyLastRestorable)),
    taskBucket(new TaskBucket(subspace.get(BackupAgentBase::keyTasks),
                              AccessSystemKeys::True,
                              PriorityBatch::False,
                              LockAware::True)),
    futureBucket(new FutureBucket(subspace.get(BackupAgentBase::keyFutures), AccessSystemKeys::True, LockAware::True)) {
}

namespace fileBackup {

// Return a block of contiguous padding bytes, growing if needed.
Value makePadding(int size) {
	static Value pad;
	if (pad.size() < size) {
		pad = makeString(size);
		memset(mutateString(pad), '\xff', pad.size());
	}

	return pad.substr(0, size);
}

struct IRangeFileWriter {
public:
	virtual Future<Void> padEnd(bool final) = 0;

	virtual Future<Void> writeKV(Key k, Value v) = 0;

	virtual Future<Void> writeKey(Key k) = 0;

	virtual Future<Void> finish() = 0;

	virtual ~IRangeFileWriter() {}
};

struct SnapshotFileBackupEncryptionKeys {
	Reference<BlobCipherKey> textCipherKey;
	Optional<Reference<BlobCipherKey>> headerCipherKey;
	StringRef ivRef;
};

// File Format handlers.
// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary
// so they can be read in parallel.
//
// Writer instances must be kept alive while any member actors are in progress.
//
// EncryptedRangeFileWriter must be used as follows:
//   1 - writeKey(key) the queried key range begin
//   2 - writeKV(k, v) each kv pair to restore
//   3 - writeKey(key) the queried key range end
//   4 - finish()
//
// EncryptedRangeFileWriter will insert the required padding, header, and extra
// end/begin keys around the 1MB boundaries as needed.
//
// Example:
//   The range a-z is queried and returns c-j, which covers 3 blocks across 2 tenants.
//   The client code writes keys in this sequence:
//             t1a t1c t1d t1e t1f t1g t2h t2i t2j t2z
//
//   H = header   P = padding   a...z = keys  v = value | = block boundary
//
//   Encoded file:  H t1a t1cv t1dv t1ev P | H t1e t1ev t1fv t1gv t2 P | H t2 t2hv t2iv t2jv t2z
//   Decoded in blocks yields:
//           Block 1: range [t1a, t1e) with kv pairs t1cv, t1dv
//           Block 2: range [t1e, t2) with kv pairs t1ev, t1fv, t1gv
//           Block 3: range [t2, t2z) with kv pairs t2hv, t2iv, t2jv
//
//   NOTE: All blocks except for the final block will have one last
//   value which will not be used.  This isn't actually a waste since
//   if the next KV pair wouldn't fit within the block after the value
//   then the space after the final key to the next 1MB boundary would
//   just be padding anyway.
//
//   NOTE: For the EncryptedRangeFileWriter blocks will be split either on the BACKUP_RANGEFILE_BLOCK_SIZE boundary or
//   when a new tenant id is encountered. If a block is split for crossing tenant boundaries then the last key will be
//   truncated to just the tenant prefix and the value will be empty (to avoid having sensitive data of one tenant be
//   encrypted with a key for a different tenant)
struct EncryptedRangeFileWriter : public IRangeFileWriter {
	EncryptedRangeFileWriter(Database cx,
	                         Arena* arena,
	                         EncryptionAtRestMode encryptMode,
	                         Optional<Reference<TenantEntryCache<Void>>> tenantCache,
	                         Reference<IBackupFile> file = Reference<IBackupFile>(),
	                         int blockSize = 0)
	  : cx(cx), arena(arena), file(file), encryptMode(encryptMode), tenantCache(tenantCache), blockSize(blockSize),
	    blockEnd(0), fileVersion(BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
		buffer = makeString(blockSize);
		wPtr = mutateString(buffer);
	}

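	// Fetches the cipher keys referenced by the block's encryption header, validates them
	// against the header's cipher details, and decrypts the payload into *arena.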
	ACTOR static Future<StringRef> decryptImpl(Database cx,
	                                           BlobCipherEncryptHeaderRef header,
	                                           const uint8_t* dataP,
	                                           int64_t dataLen,
	                                           Arena* arena) {
		Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
		state BlobCipherEncryptHeaderRef headerRef = header;
		TextAndHeaderCipherKeys cipherKeys = wait(
		    GetEncryptCipherKeys<ClientDBInfo>::getEncryptCipherKeys(dbInfo, headerRef, BlobCipherMetrics::RESTORE));
		EncryptHeaderCipherDetails cipherDetails = headerRef.getCipherDetails();
		cipherDetails.textCipherDetails.validateCipherDetailsWithCipherKey(cipherKeys.cipherTextKey);
		if (cipherDetails.headerCipherDetails.present()) {
			cipherDetails.headerCipherDetails.get().validateCipherDetailsWithCipherKey(cipherKeys.cipherHeaderKey);
		}
		DecryptBlobCipherAes256Ctr decryptor(
		    cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, headerRef.getIV(), BlobCipherMetrics::RESTORE);
		return decryptor.decrypt(dataP, dataLen, headerRef, *arena);
	}

	static Future<StringRef> decrypt(Database cx,
	                                 BlobCipherEncryptHeaderRef header,
	                                 const uint8_t* dataP,
	                                 int64_t dataLen,
	                                 Arena* arena) {
		return decryptImpl(cx, header, dataP, dataLen, arena);
	}

	ACTOR static Future<Reference<BlobCipherKey>> refreshKey(EncryptedRangeFileWriter* self,
	                                                         EncryptCipherDomainId domainId) {
		Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
		TextAndHeaderCipherKeys cipherKeys =
		    wait(GetEncryptCipherKeys<ClientDBInfo>::getLatestEncryptCipherKeysForDomain(
		        dbInfo, domainId, BlobCipherMetrics::BACKUP));
		return cipherKeys.cipherTextKey;
	}

	ACTOR static Future<Void> encrypt(EncryptedRangeFileWriter* self) {
		// TODO: HeaderCipher key not needed for 'no authentication encryption'
		ASSERT(self->cipherKeys.headerCipherKey.present() && self->cipherKeys.headerCipherKey.get().isValid() &&
		       self->cipherKeys.textCipherKey.isValid());
		// Ensure that the keys we got are still valid before flushing the block
		if (self->cipherKeys.headerCipherKey.get()->isExpired() ||
		    self->cipherKeys.headerCipherKey.get()->needsRefresh()) {
			Reference<BlobCipherKey> cipherKey =
			    wait(refreshKey(self, self->cipherKeys.headerCipherKey.get()->getDomainId()));
			self->cipherKeys.headerCipherKey = cipherKey;
		}
		if (self->cipherKeys.textCipherKey->isExpired() || self->cipherKeys.textCipherKey->needsRefresh()) {
			Reference<BlobCipherKey> cipherKey = wait(refreshKey(self, self->cipherKeys.textCipherKey->getDomainId()));
			self->cipherKeys.textCipherKey = cipherKey;
		}
		EncryptBlobCipherAes265Ctr encryptor(
		    self->cipherKeys.textCipherKey,
		    self->cipherKeys.headerCipherKey,
		    self->cipherKeys.ivRef.begin(),
		    AES_256_IV_LENGTH,
		    getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE),
		    BlobCipherMetrics::BACKUP);
		int64_t payloadSize = self->wPtr - self->dataPayloadStart;
		StringRef encryptedData;
		BlobCipherEncryptHeaderRef headerRef;
		encryptedData = encryptor.encrypt(self->dataPayloadStart, payloadSize, &headerRef, *self->arena);
		Standalone<StringRef> serialized = BlobCipherEncryptHeaderRef::toStringRef(headerRef);
		self->arena->dependsOn(serialized.arena());
		ASSERT(serialized.size() == self->encryptHeader.size());
		std::memcpy(mutateString(self->encryptHeader), serialized.begin(), self->encryptHeader.size());

		// re-write encrypted data to buffer
		std::memcpy(self->dataPayloadStart, encryptedData.begin(), payloadSize);
		return Void();
	}

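	// Looks up the encryption domain for the given key, fetches the latest text and header
	// cipher keys for that domain, and generates a fresh random IV for the next block.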
	ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self,
	                                                  KeyRef key,
	                                                  SnapshotBackupUseTenantCache checkTenantCache) {
		state EncryptCipherDomainId curDomainId =
		    wait(getEncryptionDomainDetails(key, self->encryptMode, self->tenantCache, checkTenantCache));
		state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;

		// Get text and header cipher key
		TextAndHeaderCipherKeys textAndHeaderCipherKeys =
		    wait(GetEncryptCipherKeys<ClientDBInfo>::getLatestEncryptCipherKeysForDomain(
		        dbInfo, curDomainId, BlobCipherMetrics::BACKUP));
		self->cipherKeys.textCipherKey = textAndHeaderCipherKeys.cipherTextKey;
		self->cipherKeys.headerCipherKey = textAndHeaderCipherKeys.cipherHeaderKey;

		// Set ivRef
		self->cipherKeys.ivRef = makeString(AES_256_IV_LENGTH, *self->arena);
		deterministicRandom()->randomBytes(mutateString(self->cipherKeys.ivRef), AES_256_IV_LENGTH);
		return Void();
	}

	// Returns the number of bytes that have been written to the buffer
	static int64_t currentBufferSize(EncryptedRangeFileWriter* self) { return self->wPtr - self->buffer.begin(); }

	static int64_t expectedFileSize(EncryptedRangeFileWriter* self) {
		// Return what has already been written to file plus the size of the current buffer
		// which indicates how many bytes the file will contain once the buffer is written
		return self->file->size() + currentBufferSize(self);
	}

	static void copyToBuffer(EncryptedRangeFileWriter* self, const void* src, size_t size) {
		if (size > 0) {
			std::memcpy(self->wPtr, src, size);
			self->wPtr += size;
			ASSERT(currentBufferSize(self) <= self->blockSize);
		}
	}

	static void appendStringRefWithLenToBuffer(EncryptedRangeFileWriter* self, StringRef* s) {
		// Append the string length followed by the string to the buffer
		uint32_t lenBuf = bigEndian32((uint32_t)s->size());
		copyToBuffer(self, &lenBuf, sizeof(lenBuf));
		copyToBuffer(self, s->begin(), s->size());
	}

	ACTOR static Future<EncryptCipherDomainId> getEncryptionDomainDetails(
	    KeyRef key,
	    EncryptionAtRestMode encryptMode,
	    Optional<Reference<TenantEntryCache<Void>>> tenantCache,
	    SnapshotBackupUseTenantCache checkTenantCache) {
		if (isSystemKey(key)) {
			return SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID;
		}
		if (key.size() < TenantAPI::PREFIX_SIZE || encryptMode.mode == EncryptionAtRestMode::CLUSTER_AWARE) {
			return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
		}
		// dealing with domain-aware encryption, so all keys should belong to a tenant
		KeyRef tenantPrefix = KeyRef(key.begin(), TenantAPI::PREFIX_SIZE);
		state int64_t tenantId = TenantAPI::prefixToId(tenantPrefix);
		// It's possible for the first and last key in a block (when writeKey is called) to not have a valid tenant
		// prefix, since they mark the start and end of a range; in that case we treat them as belonging to the
		// default encrypt domain for the purpose of encrypting the block
		if (checkTenantCache && tenantCache.present()) {
			Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(tenantId));
			if (!payload.present()) {
				return FDB_DEFAULT_ENCRYPT_DOMAIN_ID;
			}
		}
		return tenantId;
	}

	// Handles the first block and internal blocks.  Ends current block if needed.
	// The final flag is used in simulation to pad the file's final block to a whole block size
	ACTOR static Future<Void> newBlock(EncryptedRangeFileWriter* self,
	                                   int bytesNeeded,
	                                   KeyRef lastKey,
	                                   bool writeValue,
	                                   bool final = false) {
		// Write padding to finish current block if needed
		int bytesLeft = self->blockEnd - expectedFileSize(self);
		ASSERT(bytesLeft >= 0);
		if (bytesLeft > 0) {
			state Value paddingFFs = makePadding(bytesLeft);
			copyToBuffer(self, paddingFFs.begin(), bytesLeft);
		}

		if (expectedFileSize(self) > 0) {
			// write buffer to file since block is finished
			ASSERT(currentBufferSize(self) == self->blockSize);
			wait(encrypt(self));
			wait(self->file->append(self->buffer.begin(), self->blockSize));

			// reset write pointer to beginning of StringRef
			self->wPtr = mutateString(self->buffer);
		}

		if (final) {
			ASSERT(g_network->isSimulated());
			return Void();
		}

		// Set new blockEnd
		self->blockEnd += self->blockSize;

		// write Header
		copyToBuffer(self, (uint8_t*)&self->fileVersion, sizeof(self->fileVersion));

		// calculate encryption header size
		uint32_t headerSize = 0;
		EncryptAuthTokenMode authTokenMode =
		    getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
		EncryptAuthTokenAlgo authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode);
		headerSize =
		    BlobCipherEncryptHeaderRef::getHeaderSize(CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION,
		                                              getEncryptCurrentAlgoHeaderVersion(authTokenMode, authTokenAlgo),
		                                              ENCRYPT_CIPHER_MODE_AES_256_CTR,
		                                              authTokenMode,
		                                              authTokenAlgo);
		ASSERT(headerSize > 0);
		// write header size to buffer
		copyToBuffer(self, (uint8_t*)&headerSize, sizeof(headerSize));
		// leave space for encryption header
		self->encryptHeader = StringRef(self->wPtr, headerSize);
		self->wPtr += headerSize;
		self->dataPayloadStart = self->wPtr;

		// If this is NOT the first block then write duplicate stuff needed from last block
		if (self->blockEnd > self->blockSize) {
			appendStringRefWithLenToBuffer(self, &lastKey);
			appendStringRefWithLenToBuffer(self, &self->lastKey);
			if (writeValue) {
				appendStringRefWithLenToBuffer(self, &self->lastValue);
			}
		}

		// There must now be room in the current block for bytesNeeded or the block size is too small
		if (expectedFileSize(self) + bytesNeeded > self->blockEnd) {
			throw backup_bad_block_size();
		}

		return Void();
	}

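	// Pads the file out to a whole block size; like RangeFileWriter::padEnd below, the final
	// flag is used in simulation to make file sizes an integer multiple of the block size.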
	Future<Void> padEnd(bool final) {
		if (expectedFileSize(this) > 0) {
			return newBlock(this, 0, StringRef(), true, final);
		}
		return Void();
	}

	// Ends the current block if necessary based on bytesNeeded.
	ACTOR static Future<Void> newBlockIfNeeded(EncryptedRangeFileWriter* self, int bytesNeeded) {
		if (expectedFileSize(self) + bytesNeeded > self->blockEnd) {
			wait(newBlock(self, bytesNeeded, self->lastKey, true));
		}
		return Void();
	}

	ACTOR static Future<Void> handleTenantBoundary(EncryptedRangeFileWriter* self,
	                                               Key k,
	                                               Value v,
	                                               bool writeValue,
	                                               EncryptCipherDomainId curKeyDomainId,
	                                               SnapshotBackupUseTenantCache checkTenantCache) {
		state KeyRef endKey = k;
		// If we are crossing a boundary with a key that has a tenant prefix then truncate it
		if (curKeyDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curKeyDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
			endKey = StringRef(k.begin(), TenantAPI::PREFIX_SIZE);
		}

		state ValueRef newValue = StringRef();
		self->lastKey = k;
		self->lastValue = v;
		appendStringRefWithLenToBuffer(self, &endKey);
		appendStringRefWithLenToBuffer(self, &newValue);
		wait(newBlock(self, 0, endKey, writeValue));
		wait(updateEncryptionKeysCtx(self, self->lastKey, checkTenantCache));
		return Void();
	}

	ACTOR static Future<bool> finishCurTenantBlockStartNewIfNeeded(EncryptedRangeFileWriter* self,
	                                                               Key k,
	                                                               Value v,
	                                                               bool writeValue,
	                                                               SnapshotBackupUseTenantCache checkTenantCache) {
		// Don't want to start a new block if the current key or previous key is empty
		if (self->lastKey.size() == 0 || k.size() == 0) {
			return false;
		}
		state EncryptCipherDomainId curKeyDomainId =
		    wait(getEncryptionDomainDetails(k, self->encryptMode, self->tenantCache, checkTenantCache));
		state EncryptCipherDomainId prevKeyDomainId =
		    wait(getEncryptionDomainDetails(self->lastKey, self->encryptMode, self->tenantCache, checkTenantCache));
		if (curKeyDomainId != prevKeyDomainId) {
			CODE_PROBE(true, "crossed tenant boundaries");
			wait(handleTenantBoundary(self, k, v, writeValue, curKeyDomainId, checkTenantCache));
			return true;
		}
		return false;
	}

	// Start a new block if needed, then write the key and value
	ACTOR static Future<Void> writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) {
		if (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() ||
		    !self->cipherKeys.textCipherKey.isValid()) {
			wait(updateEncryptionKeysCtx(self, k, SnapshotBackupUseTenantCache::False));
		}
		state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
		wait(newBlockIfNeeded(self, toWrite));
		bool createdNewBlock =
		    wait(finishCurTenantBlockStartNewIfNeeded(self, k, v, true, SnapshotBackupUseTenantCache::False));
		if (createdNewBlock) {
			return Void();
		}
		appendStringRefWithLenToBuffer(self, &k);
		appendStringRefWithLenToBuffer(self, &v);
		self->lastKey = k;
		self->lastValue = v;
		return Void();
	}

	Future<Void> writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

	// Write begin key or end key.
	ACTOR static Future<Void> writeKey_impl(EncryptedRangeFileWriter* self, Key k) {
		// TODO (Nim): Is it possible to write empty begin and end keys?
		if (k.size() > 0 &&
		    (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() ||
		     !self->cipherKeys.textCipherKey.isValid())) {
			wait(updateEncryptionKeysCtx(self, k, SnapshotBackupUseTenantCache::True));
		}
		// Need to account for extra "empty" value being written in the case of crossing tenant boundaries
		int toWrite = sizeof(uint32_t) + k.size() + sizeof(uint32_t);
		wait(newBlockIfNeeded(self, toWrite));
		// We want to check the tenant cache here since the first/last key for a block may not be a valid KV pair (in
		// which case we use the default domain)
		bool createdNewBlock =
		    wait(finishCurTenantBlockStartNewIfNeeded(self, k, StringRef(), false, SnapshotBackupUseTenantCache::True));
		if (createdNewBlock) {
			return Void();
		}
		appendStringRefWithLenToBuffer(self, &k);
		self->lastKey = k;
		return Void();
	}

	Future<Void> writeKey(Key k) { return writeKey_impl(this, k); }

	ACTOR static Future<Void> finish_impl(EncryptedRangeFileWriter* self) {
		// Write any outstanding bytes to the file
		if (currentBufferSize(self) > 0) {
			wait(encrypt(self));
			wait(self->file->append(self->buffer.begin(), currentBufferSize(self)));
		}
		return Void();
	}

	Future<Void> finish() { return finish_impl(this); }

	Database cx;
	Arena* arena;
	EncryptionAtRestMode encryptMode;
	Reference<IBackupFile> file;
	Optional<Reference<TenantEntryCache<Void>>> tenantCache;
	int blockSize;

private:
	Standalone<StringRef> buffer;
	uint8_t* wPtr;
	StringRef encryptHeader;
	uint8_t* dataPayloadStart;
	int64_t blockEnd;
	uint32_t fileVersion;
	Key lastKey;
	Key lastValue;
	SnapshotFileBackupEncryptionKeys cipherKeys;
};

// File Format handlers.
// Both Range and Log formats are designed to be readable starting at any BACKUP_RANGEFILE_BLOCK_SIZE boundary
// so they can be read in parallel.
//
// Writer instances must be kept alive while any member actors are in progress.
//
// RangeFileWriter must be used as follows:
//   1 - writeKey(key) the queried key range begin
//   2 - writeKV(k, v) each kv pair to restore
//   3 - writeKey(key) the queried key range end
//   4 - finish()
//
// RangeFileWriter will insert the required padding, header, and extra
// end/begin keys around the 1MB boundaries as needed.
//
// Example:
//   The range a-z is queried and returns c-j, which covers 3 blocks.
//   The client code writes keys in this sequence:
//             a c d e f g h i j z
//
//   H = header   P = padding   a...z = keys  v = value | = block boundary
//
//   Encoded file:  H a cv dv ev P | H e ev fv gv hv P | H h hv iv jv z
//   Decoded in blocks yields:
//           Block 1: range [a, e) with kv pairs cv, dv
//           Block 2: range [e, h) with kv pairs ev, fv, gv
//           Block 3: range [h, z) with kv pairs hv, iv, jv
//
//   NOTE: All blocks except for the final block will have one last
//   value which will not be used.  This isn't actually a waste since
//   if the next KV pair wouldn't fit within the block after the value
//   then the space after the final key to the next 1MB boundary would
//   just be padding anyway.
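//
// A minimal usage sketch (illustrative only; `outFile`, `blockSize`, `range` and `kvs` are
// assumed to be provided by the caller):
//
//   state RangeFileWriter writer(outFile, blockSize);
//   wait(writer.writeKey(range.begin));                 // 1 - begin key of the queried range
//   for (state int i = 0; i < kvs.size(); ++i)
//       wait(writer.writeKV(kvs[i].key, kvs[i].value)); // 2 - each kv pair to restore
//   wait(writer.writeKey(range.end));                   // 3 - end key of the queried range
//   wait(writer.finish());                              // 4 - flush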
struct RangeFileWriter : public IRangeFileWriter {
	RangeFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0)
	  : file(file), blockSize(blockSize), blockEnd(0), fileVersion(BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {}

	// Handles the first block and internal blocks.  Ends current block if needed.
	// The final flag is used in simulation to pad the file's final block to a whole block size
	ACTOR static Future<Void> newBlock(RangeFileWriter* self, int bytesNeeded, bool final = false) {
		// Write padding to finish current block if needed
		int bytesLeft = self->blockEnd - self->file->size();
		if (bytesLeft > 0) {
			state Value paddingFFs = makePadding(bytesLeft);
			wait(self->file->append(paddingFFs.begin(), bytesLeft));
		}

		if (final) {
			ASSERT(g_network->isSimulated());
			return Void();
		}

		// Set new blockEnd
		self->blockEnd += self->blockSize;

		// write Header
		wait(self->file->append((uint8_t*)&self->fileVersion, sizeof(self->fileVersion)));

		// If this is NOT the first block then write duplicate stuff needed from last block
		if (self->blockEnd > self->blockSize) {
			wait(self->file->appendStringRefWithLen(self->lastKey));
			wait(self->file->appendStringRefWithLen(self->lastKey));
			wait(self->file->appendStringRefWithLen(self->lastValue));
		}

		// There must now be room in the current block for bytesNeeded or the block size is too small
		if (self->file->size() + bytesNeeded > self->blockEnd)
			throw backup_bad_block_size();

		return Void();
	}

	// Used in simulation only to create backup file sizes which are an integer multiple of the block size
	Future<Void> padEnd(bool final) {
		ASSERT(g_network->isSimulated());
		if (file->size() > 0) {
			return newBlock(this, 0, final);
		}
		return Void();
	}

	// Ends the current block if necessary based on bytesNeeded.
	Future<Void> newBlockIfNeeded(int bytesNeeded) {
		if (file->size() + bytesNeeded > blockEnd)
			return newBlock(this, bytesNeeded);
		return Void();
	}

	// Start a new block if needed, then write the key and value
	ACTOR static Future<Void> writeKV_impl(RangeFileWriter* self, Key k, Value v) {
		int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
		wait(self->newBlockIfNeeded(toWrite));
		wait(self->file->appendStringRefWithLen(k));
		wait(self->file->appendStringRefWithLen(v));
		self->lastKey = k;
		self->lastValue = v;
		return Void();
	}

	Future<Void> writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

	// Write begin key or end key.
	ACTOR static Future<Void> writeKey_impl(RangeFileWriter* self, Key k) {
		int toWrite = sizeof(uint32_t) + k.size();
		wait(self->newBlockIfNeeded(toWrite));
		wait(self->file->appendStringRefWithLen(k));
		return Void();
	}

	Future<Void> writeKey(Key k) { return writeKey_impl(this, k); }

	Future<Void> finish() { return Void(); }

	Reference<IBackupFile> file;
	int blockSize;

private:
	int64_t blockEnd;
	uint32_t fileVersion;
	Key lastKey;
	Key lastValue;
};

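// Decodes the length-prefixed KV pairs of a single range-file block.  As a sketch, the
// block body follows the grammar (all lengths 4-byte big-endian, 0xFF starting the padding):
//   beginKey | (key value)* | endKey | 0xFF...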
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
                                        Standalone<VectorRef<KeyValueRef>>* results,
                                        bool encryptedBlock,
                                        EncryptionAtRestMode encryptMode,
                                        Optional<int64_t> blockDomainId,
                                        Optional<Reference<TenantEntryCache<Void>>> tenantCache) {
	// Read begin key, if this fails then block was invalid.
	state uint32_t kLen = reader->consumeNetworkUInt32();
	state const uint8_t* k = reader->consume(kLen);
	results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
	state KeyRef prevKey = KeyRef(k, kLen);
	state bool done = false;
	state Optional<EncryptCipherDomainId> prevDomainId;
	// Read kv pairs and end key
	while (1) {
		// Read a key.
		kLen = reader->consumeNetworkUInt32();
		k = reader->consume(kLen);

		// make sure that all keys in a block belong to exactly one tenant,
		// unless it's the last key, in which case it can be a truncated (different) tenant prefix
		if (encryptedBlock && g_network && g_network->isSimulated()) {
			ASSERT(blockDomainId.present());
			state KeyRef curKey = KeyRef(k, kLen);
			if (!prevDomainId.present()) {
				EncryptCipherDomainId domainId = wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(
				    prevKey, encryptMode, tenantCache, SnapshotBackupUseTenantCache::False));
				prevDomainId = domainId;
			}
			state EncryptCipherDomainId curDomainId = wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(
			    curKey, encryptMode, tenantCache, SnapshotBackupUseTenantCache::False));
			if (!curKey.empty() && !prevKey.empty() && prevDomainId.get() != curDomainId) {
				ASSERT(!done);
				// Make sure that all tenant specific keys in a block have the correct prefix size
				if (curDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID &&
				    curKey.size() != TenantAPI::PREFIX_SIZE) {
					ASSERT(tenantCache.present());
					Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(curDomainId));
					ASSERT(!payload.present());
				}
				done = true;
			}
			// make sure that all keys (except the last key) in a block are encrypted using the correct key.
			if (blockDomainId.get() != FDB_DEFAULT_ENCRYPT_DOMAIN_ID && !prevKey.empty()) {
				ASSERT_EQ(prevDomainId.get(), blockDomainId.get());
			}
			prevKey = curKey;
			prevDomainId = curDomainId;
		}

		// If eof reached or first value len byte is 0xFF then a valid block end was reached.
		if (reader->eof() || *reader->rptr == 0xFF) {
			results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
			break;
		}

		// Read a value, which must exist or the block is invalid
		state uint32_t vLen = reader->consumeNetworkUInt32();
		state const uint8_t* v = reader->consume(vLen);
		if (tenantCache.present() && !isSystemKey(KeyRef(k, kLen))) {
			state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(StringRef(k, kLen));
			Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(tenantId));
			// The first and last KV pairs are not restored so if the tenant is not found for the last key then it's ok
			// to include it in the restore set
			if (!payload.present() && !(reader->eof() || *reader->rptr == 0xFF)) {
				TraceEvent(SevWarnAlways, "SnapshotRestoreTenantNotFound").detail("TenantId", tenantId);
				CODE_PROBE(true, "Snapshot restore tenant not found");
			} else {
				results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
			}
		} else {
			results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
		}

		// If eof reached or first byte of next key len is 0xFF then a valid block end was reached.
		if (reader->eof() || *reader->rptr == 0xFF)
			break;
	}

	// Make sure any remaining bytes in the block are 0xFF
	for (auto b : reader->remainder())
		if (b != 0xFF)
			throw restore_corrupted_data_padding();

	return Void();
}

static Reference<IBackupContainer> getBackupContainerWithProxy(Reference<IBackupContainer> _bc) {
	Reference<IBackupContainer> bc = IBackupContainer::openContainer(_bc->getURL(), fileBackupAgentProxy, {});
	return bc;
}

Standalone<VectorRef<KeyValueRef>> decodeRangeFileBlock(const Standalone<StringRef>& buf) {
	Standalone<VectorRef<KeyValueRef>> results({}, buf.arena());
	StringRefReader reader(buf, restore_corrupted_data());

	// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION
	if (reader.consume<int32_t>() != BACKUP_AGENT_SNAPSHOT_FILE_VERSION)
		throw restore_unsupported_file_version();

	// Read begin key, if this fails then block was invalid.
	uint32_t kLen = reader.consumeNetworkUInt32();
	const uint8_t* k = reader.consume(kLen);
	results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));

	// Read kv pairs and end key
	while (1) {
		// Read a key.
		kLen = reader.consumeNetworkUInt32();
		k = reader.consume(kLen);

		// If eof reached or first value len byte is 0xFF then a valid block end was reached.
		if (reader.eof() || *reader.rptr == 0xFF) {
			results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
			break;
		}

		// Read a value, which must exist or the block is invalid
		uint32_t vLen = reader.consumeNetworkUInt32();
		const uint8_t* v = reader.consume(vLen);
		results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));

		// If eof reached or first byte of next key len is 0xFF then a valid block end was reached.
		if (reader.eof() || *reader.rptr == 0xFF)
			break;
	}

	// Make sure any remaining bytes in the block are 0xFF
	for (auto b : reader.remainder())
		if (b != 0xFF)
			throw restore_corrupted_data_padding();

	return results;
}

ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
                                                                      int64_t offset,
                                                                      int len,
                                                                      Database cx) {
	state Standalone<StringRef> buf = makeString(len);
	int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
	if (rLen != len)
		throw restore_bad_read();

	simulateBlobFailure();

	state Standalone<VectorRef<KeyValueRef>> results({}, buf.arena());
	state StringRefReader reader(buf, restore_corrupted_data());
	state Arena arena;
	state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
	state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
	if (config.tenantMode == TenantMode::REQUIRED) {
		tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
		wait(tenantCache.get()->init());
	}
	state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode;
	state int64_t blockDomainId = TenantInfo::INVALID_TENANT;

	try {
		// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or
		// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
		int32_t file_version = reader.consume<int32_t>();
		ASSERT(!encryptMode.isEncryptionEnabled() || file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION);
		if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
			wait(decodeKVPairs(&reader, &results, false, encryptMode, Optional<int64_t>(), tenantCache));
		} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
			CODE_PROBE(true, "decoding encrypted block", probe::decoration::rare);
			// read header size
			state uint32_t headerLen = reader.consume<uint32_t>();
			// read the encryption header
			state const uint8_t* headerStart = reader.consume(headerLen);
			StringRef headerS = StringRef(headerStart, headerLen);
			state BlobCipherEncryptHeaderRef encryptHeader;

			encryptHeader = BlobCipherEncryptHeaderRef::fromStringRef(headerS);
			blockDomainId = encryptHeader.getCipherDetails().textCipherDetails.encryptDomainId;

			if (config.tenantMode == TenantMode::REQUIRED && !isReservedEncryptDomain(blockDomainId)) {
				ASSERT(tenantCache.present());
				Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(blockDomainId));
				if (!payload.present()) {
					throw tenant_not_found();
				}
			}
			const uint8_t* dataPayloadStart = headerStart + headerLen;
			// calculate the total bytes read up to (and including) the header
			int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + headerLen;
			// get the size of the encrypted payload and decrypt it
			int64_t dataLen = len - bytesRead;
			StringRef decryptedData =
			    wait(EncryptedRangeFileWriter::decrypt(cx, encryptHeader, dataPayloadStart, dataLen, &results.arena()));
			reader = StringRefReader(decryptedData, restore_corrupted_data());
			wait(decodeKVPairs(&reader, &results, true, encryptMode, blockDomainId, tenantCache));
		} else {
			throw restore_unsupported_file_version();
		}
		return results;
	} catch (Error& e) {
		if (e.code() == error_code_encrypt_keys_fetch_failed || e.code() == error_code_encrypt_key_not_found) {
			ASSERT(!isReservedEncryptDomain(blockDomainId));
			TraceEvent(SevWarnAlways, "SnapshotRestoreEncryptKeyFetchFailed").detail("TenantId", blockDomainId);
			CODE_PROBE(true, "Snapshot restore encrypt keys not found", probe::decoration::rare);
		} else if (e.code() == error_code_tenant_not_found) {
			ASSERT(!isReservedEncryptDomain(blockDomainId));
			TraceEvent(SevWarnAlways, "EncryptedSnapshotRestoreTenantNotFound").detail("TenantId", blockDomainId);
			CODE_PROBE(true, "Encrypted Snapshot restore tenant not found", probe::decoration::rare);
		}
		TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed")
		    .error(e)
		    .detail("Filename", file->getFilename())
		    .detail("BlockOffset", offset)
		    .detail("BlockLen", len)
		    .detail("ErrorRelativeOffset", reader.rptr - buf.begin())
		    .detail("ErrorAbsoluteOffset", reader.rptr - buf.begin() + offset);
		throw;
	}
}

// Very simple format compared to KeyRange files: a block header followed by
// length-prefixed [Key, Value] pairs.
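// An illustrative sketch of one block's layout (lengths are 4-byte big-endian, and any
// space left at the end of a block is 0xFF padding):
//   BACKUP_AGENT_MLOG_VERSION | kLen k vLen v | kLen k vLen v | ... | 0xFF...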
struct LogFileWriter {
	LogFileWriter(Reference<IBackupFile> file = Reference<IBackupFile>(), int blockSize = 0)
	  : file(file), blockSize(blockSize), blockEnd(0) {}

	// Start a new block if needed, then write the key and value
	ACTOR static Future<Void> writeKV_impl(LogFileWriter* self, Key k, Value v) {
		// If key and value do not fit in this block, end it and start a new one
		int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size();
		if (self->file->size() + toWrite > self->blockEnd) {
			// Write padding if needed
			int bytesLeft = self->blockEnd - self->file->size();
			if (bytesLeft > 0) {
				state Value paddingFFs = makePadding(bytesLeft);
				wait(self->file->append(paddingFFs.begin(), bytesLeft));
			}

			// Set new blockEnd
			self->blockEnd += self->blockSize;

			// write the block header
			wait(self->file->append((uint8_t*)&BACKUP_AGENT_MLOG_VERSION, sizeof(BACKUP_AGENT_MLOG_VERSION)));
		}

		wait(self->file->appendStringRefWithLen(k));
		wait(self->file->appendStringRefWithLen(v));

		// At this point we should be in whatever the current block is or the block size is too small
		if (self->file->size() > self->blockEnd)
			throw backup_bad_block_size();

		return Void();
	}

	Future<Void> writeKV(Key k, Value v) { return writeKV_impl(this, k, v); }

	Reference<IBackupFile> file;
	int blockSize;

private:
	int64_t blockEnd;
};

// input: a string of [param1, param2], [param1, param2] ..., [param1, param2]
// output: a vector of [param1, param2] after removing the length info
Standalone<VectorRef<KeyValueRef>> decodeMutationLogFileBlock(const Standalone<StringRef>& buf) {
	Standalone<VectorRef<KeyValueRef>> results({}, buf.arena());
	StringRefReader reader(buf, restore_corrupted_data());

	// Read header, currently only decoding version BACKUP_AGENT_MLOG_VERSION
	if (reader.consume<int32_t>() != BACKUP_AGENT_MLOG_VERSION)
		throw restore_unsupported_file_version();

	// Read k/v pairs.  Block ends either at end of last value exactly or with 0xFF as first key len byte.
	while (1) {
		// If eof reached or first key len bytes is 0xFF then end of block was reached.
		if (reader.eof() || *reader.rptr == 0xFF)
			break;

		// Read key and value.  If anything throws then there is a problem.
		uint32_t kLen = reader.consumeNetworkUInt32();
		const uint8_t* k = reader.consume(kLen);
		uint32_t vLen = reader.consumeNetworkUInt32();
		const uint8_t* v = reader.consume(vLen);

		results.push_back(results.arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
	}

	// Make sure any remaining bytes in the block are 0xFF
	for (auto b : reader.remainder())
		if (b != 0xFF)
			throw restore_corrupted_data_padding();

	return results;
}

ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeMutationLogFileBlock(Reference<IAsyncFile> file,
                                                                            int64_t offset,
                                                                            int len) {
	state Standalone<StringRef> buf = makeString(len);
	int rLen = wait(file->read(mutateString(buf), len, offset));
	if (rLen != len)
		throw restore_bad_read();

	try {
		return decodeMutationLogFileBlock(buf);
	} catch (Error& e) {
		TraceEvent(SevWarn, "FileRestoreCorruptLogFileBlock")
		    .error(e)
		    .detail("Filename", file->getFilename())
		    .detail("BlockOffset", offset)
		    .detail("BlockLen", len);
		throw;
	}
}

ACTOR Future<Void> checkTaskVersion(Database cx, Reference<Task> task, StringRef name, uint32_t version) {
	uint32_t taskVersion = task->getVersion();
	if (taskVersion > version) {
		state Error err = task_invalid_version();

		TraceEvent(SevWarn, "BA_BackupRangeTaskFuncExecute")
		    .detail("TaskVersion", taskVersion)
		    .detail("Name", name)
		    .detail("Version", version);
		if (KeyBackedTaskConfig::TaskParams.uid().exists(task)) {
			std::string msg = format("%s task version `%lu' is greater than supported version `%lu'",
			                         task->params[Task::reservedTaskParamKeyType].toString().c_str(),
			                         (unsigned long)taskVersion,
			                         (unsigned long)version);
			wait(BackupConfig(task).logError(cx, err, msg));
		}

		throw err;
	}

	return Void();
}

ACTOR static Future<Void> abortFiveZeroBackup(FileBackupAgent* backupAgent,
                                              Reference<ReadYourWritesTransaction> tr,
                                              std::string tagName) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	state Subspace tagNames = backupAgent->subspace.get(BackupAgentBase::keyTagName);
	Optional<Value> uidStr = wait(tr->get(tagNames.pack(Key(tagName))));
	if (!uidStr.present()) {
		TraceEvent(SevWarn, "FileBackupAbortIncompatibleBackup_TagNotFound").detail("TagName", tagName.c_str());
		return Void();
	}
	state UID uid = BinaryReader::fromStringRef<UID>(uidStr.get(), Unversioned());

	state Subspace statusSpace = backupAgent->subspace.get(BackupAgentBase::keyStates).get(uid.toString());
	state Subspace globalConfig = backupAgent->subspace.get(BackupAgentBase::keyConfig).get(uid.toString());
	state Subspace newConfigSpace = uidPrefixKey("uid->config/"_sr.withPrefix(fileBackupPrefixRange.begin), uid);

	Optional<Value> statusStr = wait(tr->get(statusSpace.pack(FileBackupAgent::keyStateStatus)));
	state EBackupState status =
	    !statusStr.present() ? EBackupState::STATE_NEVERRAN : BackupAgentBase::getState(statusStr.get().toString());

	TraceEvent(SevInfo, "FileBackupAbortIncompatibleBackup")
	    .detail("TagName", tagName.c_str())
	    .detail("Status", BackupAgentBase::getStateText(status));

	// Clear the folder id to prevent future tasks from executing at all
	tr->clear(singleKeyRange(StringRef(globalConfig.pack(FileBackupAgent::keyFolderId))));

	// Clear the mutations logging config and data
	Key configPath = uidPrefixKey(logRangesRange.begin, uid);
	Key logsPath = uidPrefixKey(backupLogKeys.begin, uid);
	tr->clear(KeyRangeRef(configPath, strinc(configPath)));
	tr->clear(KeyRangeRef(logsPath, strinc(logsPath)));

	// Clear the new-style config space
	tr->clear(newConfigSpace.range());

	Key statusKey = StringRef(statusSpace.pack(FileBackupAgent::keyStateStatus));

	// Set old style state key to Aborted if it was Runnable
	if (backupAgent->isRunnable(status))
		tr->set(statusKey, StringRef(FileBackupAgent::getStateText(EBackupState::STATE_ABORTED)));

	return Void();
}

struct AbortFiveZeroBackupTask : TaskFuncBase {
	static StringRef name;
	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state FileBackupAgent backupAgent;
		state std::string tagName = task->params[BackupAgentBase::keyConfigBackupTag].toString();

		TraceEvent(SevInfo, "FileBackupCancelOldTask")
		    .detail("Task", task->params[Task::reservedTaskParamKeyType])
		    .detail("TagName", tagName);
		wait(abortFiveZeroBackup(&backupAgent, tr, tagName));

		wait(taskBucket->finish(tr, task));
		return Void();
	}

	StringRef getName() const override {
		TraceEvent(SevError, "FileBackupError")
		    .detail("Cause", "AbortFiveZeroBackupTaskFunc::name() should never be called");
		ASSERT(false);
		return StringRef();
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Future<Void>(Void());
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef AbortFiveZeroBackupTask::name = "abort_legacy_backup"_sr;
REGISTER_TASKFUNC(AbortFiveZeroBackupTask);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_diff_logs);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_log_range);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_logs);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_range);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_backup_restorable);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finish_full_backup);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_finished_full_backup);
REGISTER_TASKFUNC_ALIAS(AbortFiveZeroBackupTask, file_start_full_backup);

ACTOR static Future<Void> abortFiveOneBackup(FileBackupAgent* backupAgent,
                                             Reference<ReadYourWritesTransaction> tr,
                                             std::string tagName) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	state KeyBackedTag tag = makeBackupTag(tagName);
	state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, Snapshot::False, backup_unneeded()));

	state BackupConfig config(current.first);
	EBackupState status = wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));

	if (!backupAgent->isRunnable(status)) {
		throw backup_unneeded();
	}

	TraceEvent(SevInfo, "FBA_AbortFileOneBackup")
	    .detail("TagName", tagName.c_str())
	    .detail("Status", BackupAgentBase::getStateText(status));

	// Cancel backup task through tag
	wait(tag.cancel(tr));

	Key configPath = uidPrefixKey(logRangesRange.begin, config.getUid());
	Key logsPath = uidPrefixKey(backupLogKeys.begin, config.getUid());

	tr->clear(KeyRangeRef(configPath, strinc(configPath)));
	tr->clear(KeyRangeRef(logsPath, strinc(logsPath)));

	config.stateEnum().set(tr, EBackupState::STATE_ABORTED);

	return Void();
}

struct AbortFiveOneBackupTask : TaskFuncBase {
	static StringRef name;
	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state FileBackupAgent backupAgent;
		state BackupConfig config(task);
		state std::string tagName = wait(config.tag().getOrThrow(tr));

		TraceEvent(SevInfo, "FileBackupCancelFiveOneTask")
		    .detail("Task", task->params[Task::reservedTaskParamKeyType])
		    .detail("TagName", tagName);
		wait(abortFiveOneBackup(&backupAgent, tr, tagName));

		wait(taskBucket->finish(tr, task));
		return Void();
	}

	StringRef getName() const override {
		TraceEvent(SevError, "FileBackupError")
		    .detail("Cause", "AbortFiveOneBackupTaskFunc::name() should never be called");
		ASSERT(false);
		return StringRef();
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Future<Void>(Void());
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef AbortFiveOneBackupTask::name = "abort_legacy_backup_5.2"_sr;
REGISTER_TASKFUNC(AbortFiveOneBackupTask);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_range);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_ranges);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_logs);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_erase_logs);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_dispatch_logs);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_finished);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_write_snapshot_manifest);
REGISTER_TASKFUNC_ALIAS(AbortFiveOneBackupTask, file_backup_start);

std::function<void(Reference<Task>)> NOP_SETUP_TASK_FN = [](Reference<Task> task) { /* NOP */ };
ACTOR static Future<Key> addBackupTask(StringRef name,
                                       uint32_t version,
                                       Reference<ReadYourWritesTransaction> tr,
                                       Reference<TaskBucket> taskBucket,
                                       TaskCompletionKey completionKey,
                                       BackupConfig config,
                                       Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
                                       std::function<void(Reference<Task>)> setupTaskFn = NOP_SETUP_TASK_FN,
                                       int priority = 0,
                                       SetValidation setValidation = SetValidation::True) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	Key doneKey = wait(completionKey.get(tr, taskBucket));
	state Reference<Task> task(new Task(name, version, doneKey, priority));

	// Bind backup config to new task
	// allow this new task to find the config(keyspace) of the parent task
	wait(config.toTask(tr, task, setValidation));

	// Set task specific params
	setupTaskFn(task);

	if (!waitFor) {
		return taskBucket->addTask(tr, task);
	}
	wait(waitFor->onSetAddTask(tr, taskBucket, task));

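	// When waitFor is present the task is scheduled via onSetAddTask, so no real task key
	// exists yet; return a placeholder instead.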
	return "OnSetAddTask"_sr;
}

// Clears the backup ID from "backupStartedKey" to pause backup workers.
ACTOR static Future<Void> clearBackupStartID(Reference<ReadYourWritesTransaction> tr, UID backupUid) {
	// If backup worker is not enabled, exit early.
	Optional<Value> started = wait(tr->get(backupStartedKey));
	std::vector<std::pair<UID, Version>> ids;
	if (started.present()) {
		ids = decodeBackupStartedValue(started.get());
	}
	auto it =
	    std::find_if(ids.begin(), ids.end(), [=](const std::pair<UID, Version>& p) { return p.first == backupUid; });
	if (it != ids.end()) {
		ids.erase(it);
	}

	if (ids.empty()) {
		TraceEvent("ClearBackup").detail("BackupID", backupUid);
		tr->clear(backupStartedKey);
	} else {
		tr->set(backupStartedKey, encodeBackupStartedValue(ids));
	}
	return Void();
}

// Backup and Restore taskFunc definitions will inherit from one of the following classes, which
// serve to catch any error that execute/finish didn't handle and log it to the appropriate config.
struct RestoreTaskFuncBase : TaskFuncBase {
	Future<Void> handleError(Database cx, Reference<Task> task, Error const& error) final {
		return RestoreConfig(task).logError(
		    cx,
		    error,
		    format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str()));
	}
	virtual std::string toString(Reference<Task> task) const { return ""; }
};

struct BackupTaskFuncBase : TaskFuncBase {
	Future<Void> handleError(Database cx, Reference<Task> task, Error const& error) final {
		return BackupConfig(task).logError(
		    cx,
		    error,
		    format("'%s' on '%s'", error.what(), task->params[Task::reservedTaskParamKeyType].printable().c_str()));
	}
	virtual std::string toString(Reference<Task> task) const { return ""; }
};

ACTOR static Future<Standalone<VectorRef<KeyRef>>> getBlockOfShards(Reference<ReadYourWritesTransaction> tr,
                                                                    Key beginKey,
                                                                    Key endKey,
                                                                    int limit) {

	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	state Standalone<VectorRef<KeyRef>> results;
	RangeResult values = wait(tr->getRange(
	    KeyRangeRef(keyAfter(beginKey.withPrefix(keyServersPrefix)), endKey.withPrefix(keyServersPrefix)), limit));

	for (auto& s : values) {
		KeyRef k = s.key.removePrefix(keyServersPrefix);
		results.push_back_deep(results.arena(), k);
	}

	return results;
}

struct BackupRangeTaskFunc : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		static TaskParam<Key> beginKey() { return __FUNCTION__sr; }
		static TaskParam<Key> endKey() { return __FUNCTION__sr; }
		static TaskParam<bool> addBackupRangeTasks() { return __FUNCTION__sr; }
	} Params;

	std::string toString(Reference<Task> task) const override {
		return format("beginKey '%s' endKey '%s' addTasks %d",
		              Params.beginKey().get(task).printable().c_str(),
		              Params.endKey().get(task).printable().c_str(),
		              Params.addBackupRangeTasks().get(task));
	}

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};

	// Finish (which flushes/syncs) the file, and then in a single transaction, make some range backup progress
	// durable. This means:
	//  - increment the backup config's range bytes written
	//  - update the range file map
	//  - update the task begin key
	//  - save/extend the task with the new params
	// Returns whether or not the caller should continue executing the task.
	ACTOR static Future<bool> finishRangeFile(Reference<IBackupFile> file,
	                                          Database cx,
	                                          Reference<Task> task,
	                                          Reference<TaskBucket> taskBucket,
	                                          KeyRange range,
	                                          Version version) {
		wait(file->finish());

		// Ignore empty ranges.
		if (range.empty())
			return false;

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state BackupConfig backup(task);
		state bool usedFile = false;

		// Avoid an unnecessary conflict by preventing TaskBucket's automatic timeout extension,
		// because the following transaction loop extends and updates the task itself.
		wait(task->extendMutex.take());
		state FlowLock::Releaser releaser(task->extendMutex, 1);

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				// Update the start key of the task so if this transaction completes but the task then fails
				// when it is restarted it will continue where this execution left off.
				Params.beginKey().set(task, range.end);

				// Save and extend the task with the new begin parameter
				state Version newTimeout = wait(taskBucket->extendTimeout(tr, task, UpdateParams::True));

				// Update the range bytes written in the backup config
				backup.rangeBytesWritten().atomicOp(tr, file->size(), MutationRef::AddValue);
				backup.snapshotRangeFileCount().atomicOp(tr, 1, MutationRef::AddValue);

				// See if there is already a file for this key which has an earlier begin, update the map if not.
				Optional<BackupConfig::RangeSlice> s = wait(backup.snapshotRangeFileMap().get(tr, range.end));
				if (!s.present() || s.get().begin >= range.begin) {
					backup.snapshotRangeFileMap().set(
					    tr, range.end, { range.begin, version, file->getFileName(), file->size() });
					usedFile = true;
				}

				wait(tr->commit());
				task->timeoutVersion = newTimeout;
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		return usedFile;
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 int priority,
	                                 Key begin,
	                                 Key end,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
	                                 Version scheduledVersion = invalidVersion) {
		Key key = wait(addBackupTask(
		    BackupRangeTaskFunc::name,
		    BackupRangeTaskFunc::version,
		    tr,
		    taskBucket,
		    completionKey,
		    BackupConfig(parentTask),
		    waitFor,
		    [=](Reference<Task> task) {
			    Params.beginKey().set(task, begin);
			    Params.endKey().set(task, end);
			    Params.addBackupRangeTasks().set(task, false);
			    if (scheduledVersion != invalidVersion)
				    ReservedTaskParams::scheduledVersion().set(task, scheduledVersion);
		    },
		    priority));
		return key;
	}

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state Reference<FlowLock> lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES));

		wait(checkTaskVersion(cx, task, BackupRangeTaskFunc::name, BackupRangeTaskFunc::version));

		state Key beginKey = Params.beginKey().get(task);
		state Key endKey = Params.endKey().get(task);

		TraceEvent("FileBackupRangeStart")
		    .suppressFor(60)
		    .detail("BackupUID", BackupConfig(task).getUid())
		    .detail("BeginKey", Params.beginKey().get(task).printable())
		    .detail("EndKey", Params.endKey().get(task).printable())
		    .detail("TaskKey", task->key.printable());

		// When a key range task saves the last chunk of progress and then the executor dies, the task will be
		// continued with its beginKey and endKey equal, in which case there is no work to be done.
		if (beginKey == endKey)
			return Void();

		// Find out if there is a shard boundary in (beginKey, endKey)
		Standalone<VectorRef<KeyRef>> keys = wait(runRYWTransaction(
		    cx, [=](Reference<ReadYourWritesTransaction> tr) { return getBlockOfShards(tr, beginKey, endKey, 1); }));
		if (keys.size() > 0) {
			Params.addBackupRangeTasks().set(task, true);
			return Void();
		}

		// Read everything from beginKey to endKey, write it to an output file, run the output file processor, and
		// then set on_done. If we are still writing after X seconds, end the output file and insert a new
		// backup_range task for the remainder.
		state Reference<IBackupFile> outFile;
		state Version outVersion = invalidVersion;
		state Key lastKey;

		// retrieve kvData
		state PromiseStream<RangeResultWithVersion> results;

		state Future<Void> rc = readCommitted(cx,
		                                      results,
		                                      lock,
		                                      KeyRangeRef(beginKey, endKey),
		                                      Terminator::True,
		                                      AccessSystemKeys::True,
		                                      LockAware::True);
		state std::unique_ptr<IRangeFileWriter> rangeFile;
		state BackupConfig backup(task);
		state Arena arena;

		DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
		state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode;
		state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
		if (encryptMode.mode == EncryptionAtRestMode::DOMAIN_AWARE) {
			tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
			wait(tenantCache.get()->init());
		}

		// Don't need to check keepRunning(task) here because we will do that while finishing each output file, but
		// if the backup container reference is null then clearly the backup is no longer in progress
		Reference<IBackupContainer> _bc = wait(backup.backupContainer().getD(cx.getReference()));
		if (!_bc) {
			return Void();
		}
		state Reference<IBackupContainer> bc = getBackupContainerWithProxy(_bc);
		state bool done = false;
		state int64_t nrKeys = 0;
		state Optional<bool> encryptionEnabled;

		loop {
			state RangeResultWithVersion values;
			try {
				RangeResultWithVersion _values = waitNext(results.getFuture());
				values = _values;
				lock->release(values.first.expectedSize());
			} catch (Error& e) {
				if (e.code() == error_code_end_of_stream)
					done = true;
				else
					throw;
			}

			// If we've seen a new read version OR hit the end of the stream, then if we were writing a file finish
			// it.
			if (values.second != outVersion || done) {
				if (outFile) {
					CODE_PROBE(outVersion != invalidVersion, "Backup range task wrote multiple versions");
					state Key nextKey = done ? endKey : keyAfter(lastKey);
					wait(rangeFile->writeKey(nextKey));

					if (BUGGIFY) {
						wait(rangeFile->padEnd(true));
					}

					wait(rangeFile->finish());

					bool usedFile = wait(
					    finishRangeFile(outFile, cx, task, taskBucket, KeyRangeRef(beginKey, nextKey), outVersion));
					TraceEvent("FileBackupWroteRangeFile")
					    .suppressFor(60)
					    .detail("BackupUID", backup.getUid())
					    .detail("Size", outFile->size())
					    .detail("Keys", nrKeys)
					    .detail("ReadVersion", outVersion)
					    .detail("BeginKey", beginKey.printable())
					    .detail("EndKey", nextKey.printable())
					    .detail("AddedFileToMap", usedFile);

					nrKeys = 0;
					beginKey = nextKey;
				}

				if (done)
					return Void();

				// Start writing a new file after verifying this task should keep running as of a new read version
				// (which must be >= outVersion)
				outVersion = values.second;
				// The block size must be at least large enough for 3 max-size keys and 2 max-size values plus
				// overhead, so 250k is a conservative choice.
				state int blockSize =
				    BUGGIFY ? deterministicRandom()->randomInt(250e3, 4e6) : CLIENT_KNOBS->BACKUP_RANGEFILE_BLOCK_SIZE;
				state Version snapshotBeginVersion;
				state int64_t snapshotRangeFileCount;

				state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
				loop {
					try {
						tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
						tr->setOption(FDBTransactionOptions::LOCK_AWARE);

						wait(taskBucket->keepRunning(tr, task) &&
						     storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) &&
						     store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
						     store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)));

						break;
					} catch (Error& e) {
						wait(tr->onError(e));
					}
				}

				Reference<IBackupFile> f =
				    wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize));
				outFile = f;

				if (!encryptionEnabled.present() || !encryptionEnabled.get()) {
					encryptMode = EncryptionAtRestMode::DISABLED;
				}
				TraceEvent(SevDebug, "EncryptionMode").detail("EncryptMode", encryptMode.toString());
				// Initialize range file writer and write begin key
				if (encryptMode.mode != EncryptionAtRestMode::DISABLED) {
					CODE_PROBE(true, "using encrypted snapshot file writer", probe::decoration::rare);
					rangeFile = std::make_unique<EncryptedRangeFileWriter>(
					    cx, &arena, encryptMode, tenantCache, outFile, blockSize);
				} else {
					rangeFile = std::make_unique<RangeFileWriter>(outFile, blockSize);
				}
				wait(rangeFile->writeKey(beginKey));
			}

			// write kvData to file, update lastKey and key count
			if (values.first.size() != 0) {
				state size_t i = 0;
				for (; i < values.first.size(); ++i) {
					wait(rangeFile->writeKV(values.first[i].key, values.first[i].value));
				}
				lastKey = values.first.back().key;
				nrKeys += values.first.size();
			}
		}
	}

	ACTOR static Future<Void> startBackupRangeInternal(Reference<ReadYourWritesTransaction> tr,
	                                                   Reference<TaskBucket> taskBucket,
	                                                   Reference<FutureBucket> futureBucket,
	                                                   Reference<Task> task,
	                                                   Reference<TaskFuture> onDone) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
		state Key nextKey = Params.beginKey().get(task);
		state Key endKey = Params.endKey().get(task);

		state Standalone<VectorRef<KeyRef>> keys =
		    wait(getBlockOfShards(tr, nextKey, endKey, CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT));

		std::vector<Future<Key>> addTaskVector;
		for (int idx = 0; idx < keys.size(); ++idx) {
			if (nextKey != keys[idx]) {
				addTaskVector.push_back(addTask(tr,
				                                taskBucket,
				                                task,
				                                task->getPriority(),
				                                nextKey,
				                                keys[idx],
				                                TaskCompletionKey::joinWith(onDone)));
				TraceEvent("FileBackupRangeSplit")
				    .suppressFor(60)
				    .detail("BackupUID", BackupConfig(task).getUid())
				    .detail("BeginKey", Params.beginKey().get(task).printable())
				    .detail("EndKey", Params.endKey().get(task).printable())
				    .detail("SliceBeginKey", nextKey.printable())
				    .detail("SliceEndKey", keys[idx].printable());
			}
			nextKey = keys[idx];
		}

		wait(waitForAll(addTaskVector));

		if (nextKey != endKey) {
			// Add task to cover nextKey to the end, using the priority of the current task
			wait(success(addTask(tr,
			                     taskBucket,
			                     task,
			                     task->getPriority(),
			                     nextKey,
			                     endKey,
			                     TaskCompletionKey::joinWith(onDone),
			                     Reference<TaskFuture>(),
			                     task->getPriority())));
		}

		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);
		if (Params.addBackupRangeTasks().get(task)) {
			wait(startBackupRangeInternal(tr, taskBucket, futureBucket, task, taskFuture));
		} else {
			wait(taskFuture->set(tr, taskBucket));
		}

		wait(taskBucket->finish(tr, task));

		TraceEvent("FileBackupRangeFinish")
		    .suppressFor(60)
		    .detail("BackupUID", BackupConfig(task).getUid())
		    .detail("BeginKey", Params.beginKey().get(task).printable())
		    .detail("EndKey", Params.endKey().get(task).printable())
		    .detail("TaskKey", task->key.printable());

		return Void();
	}
};
StringRef BackupRangeTaskFunc::name = "file_backup_write_range_5.2"_sr;
REGISTER_TASKFUNC(BackupRangeTaskFunc);

struct BackupSnapshotDispatchTask : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		// Set by Execute, used by Finish
		static TaskParam<int64_t> shardsBehind() { return __FUNCTION__sr; }
		// Set by Execute, used by Finish
		static TaskParam<bool> snapshotFinished() { return __FUNCTION__sr; }
		// Set by Execute, used by Finish
		static TaskParam<Version> nextDispatchVersion() { return __FUNCTION__sr; }
	} Params;

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 int priority,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
	                                 Version scheduledVersion = invalidVersion) {
		Key key = wait(addBackupTask(
		    name,
		    version,
		    tr,
		    taskBucket,
		    completionKey,
		    BackupConfig(parentTask),
		    waitFor,
		    [=](Reference<Task> task) {
			    if (scheduledVersion != invalidVersion)
				    ReservedTaskParams::scheduledVersion().set(task, scheduledVersion);
		    },
		    priority));
		return key;
	}

	enum DispatchState { SKIP = 0, DONE = 1, NOT_DONE_MIN = 2 };

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state Reference<FlowLock> lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES));
		wait(checkTaskVersion(cx, task, name, version));

		state double startTime = timer();
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));

		// The shard map will use 3 value classes: exactly SKIP, exactly DONE, and any number >= NOT_DONE_MIN,
		// which means not done. Each not-done shard gets a distinct sequence value so that coalesce() squashes
		// adjacent DONE and SKIP ranges while preserving individual not-done shards, which makes it efficient to
		// find random database shards which are not done.
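		// For example, the value sequence [DONE, DONE, 7, 8, SKIP, SKIP] coalesces to [DONE, 7, 8, SKIP]: the
		// finished and skipped runs each merge into one range, while the not-done shards stay distinct.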
		state int notDoneSequence = NOT_DONE_MIN;
		state KeyRangeMap<int> shardMap(notDoneSequence++);
		state Key beginKey = allKeys.begin;

		// Read all shard boundaries and add them to the map
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state Future<Standalone<VectorRef<KeyRef>>> shardBoundaries =
				    getBlockOfShards(tr, beginKey, allKeys.end, CLIENT_KNOBS->TOO_MANY);
				wait(success(shardBoundaries) && taskBucket->keepRunning(tr, task));

				if (shardBoundaries.get().size() == 0)
					break;

				for (auto& boundary : shardBoundaries.get()) {
					shardMap.rawInsert(boundary, notDoneSequence++);
				}

				beginKey = keyAfter(shardBoundaries.get().back());
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// Read required stuff from backup config
		state BackupConfig config(task);
		state Version recentReadVersion;
		state Version snapshotBeginVersion;
		state Version snapshotTargetEndVersion;
		state int64_t snapshotIntervalSeconds;
		state Optional<Version> latestSnapshotEndVersion;
		state std::vector<KeyRange> backupRanges;
		state Optional<Key> snapshotBatchFutureKey;
		state Reference<TaskFuture> snapshotBatchFuture;
		state Optional<int64_t> snapshotBatchSize;

		tr->reset();
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) &&
				     store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) &&
				     store(backupRanges, config.backupRanges().getOrThrow(tr)) &&
				     store(snapshotIntervalSeconds, config.snapshotIntervalSeconds().getOrThrow(tr))
				     // The next three parameters are optional
				     && store(snapshotBatchFutureKey, config.snapshotBatchFuture().get(tr)) &&
				     store(snapshotBatchSize, config.snapshotBatchSize().get(tr)) &&
				     store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) &&
				     store(recentReadVersion, tr->getReadVersion()) && taskBucket->keepRunning(tr, task));

				// If the snapshot batch future key does not exist, this is the first execution of this dispatch
				// task so
				//    - create and set the snapshot batch future key
				//    - initialize the batch size to 0
				//    - initialize the target snapshot end version if it is not yet set
				//    - commit
				if (!snapshotBatchFutureKey.present()) {
					snapshotBatchFuture = futureBucket->future(tr);
					config.snapshotBatchFuture().set(tr, snapshotBatchFuture->pack());
					snapshotBatchSize = 0;
					config.snapshotBatchSize().set(tr, snapshotBatchSize.get());

					// The dispatch of this batch can take multiple separate executions if the executor fails
					// so store a completion key for the dispatch finish() to set when dispatching the batch is
					// done.
					state TaskCompletionKey dispatchCompletionKey = TaskCompletionKey::joinWith(snapshotBatchFuture);
					// This is a hack: Flow doesn't work well with lambda functions capturing state
					// variables, so pass raw pointers to the state variables into the lambda instead.
					auto cfg = &config;
					auto tx = &tr;
					wait(map(dispatchCompletionKey.get(tr, taskBucket), [cfg, tx](Key const& k) {
						cfg->snapshotBatchDispatchDoneKey().set(*tx, k);
						return Void();
					}));
					wait(tr->commit());
				} else {
					ASSERT(snapshotBatchSize.present());
					// Batch future key exists in the config so create future from it
					snapshotBatchFuture = makeReference<TaskFuture>(futureBucket, snapshotBatchFutureKey.get());
				}

				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// Read all dispatched ranges
		state std::vector<std::pair<Key, bool>> dispatchBoundaries;
		tr->reset();
		beginKey = allKeys.begin;
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state Future<BackupConfig::RangeDispatchMapT::RangeResultType> bounds =
				    config.snapshotRangeDispatchMap().getRange(
				        tr, beginKey, keyAfter(allKeys.end), CLIENT_KNOBS->TOO_MANY);
				wait(success(bounds) && taskBucket->keepRunning(tr, task) &&
				     store(recentReadVersion, tr->getReadVersion()));

				if (!bounds.get().results.empty()) {
					dispatchBoundaries.reserve(dispatchBoundaries.size() + bounds.get().results.size());
					dispatchBoundaries.insert(
					    dispatchBoundaries.end(), bounds.get().results.begin(), bounds.get().results.end());
				}

				if (!bounds.get().more)
					break;

				beginKey = keyAfter(bounds.get().results.back().first);
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// The next few sections involve combining the results above.  Yields are used after operations
		// that may have touched many thousands of items and inside loops that may run for many
		// thousands of iterations.
		// Declare some common iterators which must be state vars and will be used multiple times.
		state int i;
		state RangeMap<Key, int, KeyRangeRef>::iterator iShard;
		state RangeMap<Key, int, KeyRangeRef>::iterator iShardEnd;

		// Set anything inside a dispatched range to DONE.
		// Also ensure that the boundary values alternate: true, false, [true, false]...
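		// For example, boundaries [(a,true), (c,false), (e,true), (g,false)] encode the dispatched ranges
		// [a,c) and [e,g).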
		if (dispatchBoundaries.size() > 0) {
			state bool lastValue = false;
			state Key lastKey;
			for (i = 0; i < dispatchBoundaries.size(); ++i) {
				const std::pair<Key, bool>& boundary = dispatchBoundaries[i];

				// Values must alternate
				ASSERT(boundary.second == !lastValue);

				// If this was the end of a dispatched range
				if (!boundary.second) {
					// Ensure that the dispatched boundaries exist AND set all shard ranges in the dispatched range
					// to DONE.
					RangeMap<Key, int, KeyRangeRef>::Ranges shardRanges =
					    shardMap.modify(KeyRangeRef(lastKey, boundary.first));
					iShard = shardRanges.begin();
					iShardEnd = shardRanges.end();
					for (; iShard != iShardEnd; ++iShard) {
						iShard->value() = DONE;
						wait(yield());
					}
				}
				lastValue = dispatchBoundaries[i].second;
				lastKey = dispatchBoundaries[i].first;

				wait(yield());
			}
			ASSERT(lastValue == false);
		}

		// Set anything outside the backup ranges to SKIP.  We can use insert() here instead of modify()
		// because it's OK to delete shard boundaries in the skipped ranges.
		if (backupRanges.size() > 0) {
			shardMap.insert(KeyRangeRef(allKeys.begin, backupRanges.front().begin), SKIP);
			wait(yield());

			for (i = 0; i < backupRanges.size() - 1; ++i) {
				shardMap.insert(KeyRangeRef(backupRanges[i].end, backupRanges[i + 1].begin), SKIP);
				wait(yield());
			}

			shardMap.insert(KeyRangeRef(backupRanges.back().end, allKeys.end), SKIP);
			wait(yield());
		}

		state int countShardsDone = 0;
		state int countShardsNotDone = 0;

		// Scan through the shard map, counting the DONE and NOT_DONE shards.
		RangeMap<Key, int, KeyRangeRef>::Ranges shardRanges = shardMap.ranges();
		iShard = shardRanges.begin();
		iShardEnd = shardRanges.end();
		for (; iShard != iShardEnd; ++iShard) {
			if (iShard->value() == DONE) {
				++countShardsDone;
			} else if (iShard->value() >= NOT_DONE_MIN)
				++countShardsNotDone;

			wait(yield());
		}

		// Coalesce the shard map to make random selection below more efficient.
		shardMap.coalesce(allKeys);
		wait(yield());

		// In this context "all" refers to all of the shards relevant for this particular backup
		state int countAllShards = countShardsDone + countShardsNotDone;

		if (countShardsNotDone == 0) {
			TraceEvent("FileBackupSnapshotDispatchFinished")
			    .detail("BackupUID", config.getUid())
			    .detail("AllShards", countAllShards)
			    .detail("ShardsDone", countShardsDone)
			    .detail("ShardsNotDone", countShardsNotDone)
			    .detail("SnapshotBeginVersion", snapshotBeginVersion)
			    .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
			    .detail("CurrentVersion", recentReadVersion)
			    .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds);
			Params.snapshotFinished().set(task, true);
			return Void();
		}

		// Decide when the next snapshot dispatch should run.
		state Version nextDispatchVersion;

		// In simulation, use snapshot interval / 5 to ensure multiple dispatches run
		// Otherwise, use the knob for the number of seconds between snapshot dispatch tasks.
		if (g_network->isSimulated())
			nextDispatchVersion =
			    recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND * (snapshotIntervalSeconds / 5.0);
		else
			nextDispatchVersion = recentReadVersion + CLIENT_KNOBS->CORE_VERSIONSPERSECOND *
			                                              CLIENT_KNOBS->BACKUP_SNAPSHOT_DISPATCH_INTERVAL_SEC;
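		// For illustration (knob values hypothetical): with CORE_VERSIONSPERSECOND = 1e6 and a dispatch
		// interval knob of 300 seconds, the next dispatch would be scheduled 3e8 versions after
		// recentReadVersion.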

		// If nextDispatchVersion is greater than snapshotTargetEndVersion (which could be in the past) then just
		// use the greater of recentReadVersion or snapshotTargetEndVersion.  Any range tasks created in this
		// dispatch will be scheduled at a random time between recentReadVersion and nextDispatchVersion, so
		// nextDispatchVersion shouldn't be less than recentReadVersion.
		if (nextDispatchVersion > snapshotTargetEndVersion)
			nextDispatchVersion = std::max(recentReadVersion, snapshotTargetEndVersion);

		Params.nextDispatchVersion().set(task, nextDispatchVersion);

		// Calculate the number of shards that should be done before the next interval end.
		// timeElapsed is between 0 and 1 and represents what portion of the shards should be completed by now.
		double timeElapsed;
		Version snapshotScheduledVersionInterval = snapshotTargetEndVersion - snapshotBeginVersion;
		if (snapshotTargetEndVersion > snapshotBeginVersion)
			timeElapsed = std::min(
			    1.0, (double)(nextDispatchVersion - snapshotBeginVersion) / (snapshotScheduledVersionInterval));
		else
			timeElapsed = 1.0;

		state int countExpectedShardsDone = countAllShards * timeElapsed;
		state int countShardsToDispatch = std::max<int>(0, countExpectedShardsDone - countShardsDone);
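		// For example: with 100 total shards, a nextDispatchVersion 40% of the way through the scheduled
		// snapshot window, and 25 shards already done, countExpectedShardsDone is 40 and countShardsToDispatch
		// is 15.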

		// Calculate the number of shards that would have been dispatched by a normal (on-schedule)
		// BackupSnapshotDispatchTask given the dispatch window and the start and expected-end versions of the
		// current snapshot.
		int64_t dispatchWindow = nextDispatchVersion - recentReadVersion;

		// If the scheduled snapshot interval is 0 (such as for the initial, as-fast-as-possible snapshot) then
		// all shards are considered late
		int countShardsExpectedPerNormalWindow;
		if (snapshotScheduledVersionInterval == 0) {
			countShardsExpectedPerNormalWindow = 0;
		} else {
			// A dispatchWindow of 0 means the target end version is <= now which also results in all shards being
			// considered late
			countShardsExpectedPerNormalWindow =
			    (double(dispatchWindow) / snapshotScheduledVersionInterval) * countAllShards;
		}

		// The number of shards 'behind' the snapshot is the count of how many additional shards beyond normal are
		// being dispatched, if any.
		int countShardsBehind =
		    std::max<int64_t>(0, countShardsToDispatch + snapshotBatchSize.get() - countShardsExpectedPerNormalWindow);
		Params.shardsBehind().set(task, countShardsBehind);
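		// For example: if 15 shards are being dispatched now, the current batch already holds 5, and a normal
		// on-schedule window would dispatch 12, then countShardsBehind = max(0, 15 + 5 - 12) = 8.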

		TraceEvent("FileBackupSnapshotDispatchStats")
		    .detail("BackupUID", config.getUid())
		    .detail("AllShards", countAllShards)
		    .detail("ShardsDone", countShardsDone)
		    .detail("ShardsNotDone", countShardsNotDone)
		    .detail("ExpectedShardsDone", countExpectedShardsDone)
		    .detail("ShardsToDispatch", countShardsToDispatch)
		    .detail("ShardsBehind", countShardsBehind)
		    .detail("SnapshotBeginVersion", snapshotBeginVersion)
		    .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
		    .detail("NextDispatchVersion", nextDispatchVersion)
		    .detail("CurrentVersion", recentReadVersion)
		    .detail("TimeElapsed", timeElapsed)
		    .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds);

		// Dispatch random shards to catch up to the expected progress
		while (countShardsToDispatch > 0) {
			// First select ranges to add
			state std::vector<KeyRange> rangesToAdd;

			// Limit number of tasks added per transaction
			int taskBatchSize = BUGGIFY ? deterministicRandom()->randomInt(1, countShardsToDispatch + 1)
			                            : CLIENT_KNOBS->BACKUP_DISPATCH_ADDTASK_SIZE;
			int added = 0;

			while (countShardsToDispatch > 0 && added < taskBatchSize && shardMap.size() > 0) {
				// Get a random range.
				auto it = shardMap.randomRange();
				// Find a NOT_DONE range and add it to rangesToAdd
				while (1) {
					if (it->value() >= NOT_DONE_MIN) {
						rangesToAdd.push_back(it->range());
						it->value() = DONE;
						shardMap.coalesce(Key(it->begin()));
						++added;
						++countShardsDone;
						--countShardsToDispatch;
						--countShardsNotDone;
						break;
					}
					if (it->end() == shardMap.mapEnd)
						break;
					++it;
				}
			}

			state int64_t oldBatchSize = snapshotBatchSize.get();
			state int64_t newBatchSize = oldBatchSize + rangesToAdd.size();

			// Now add the selected ranges in a single transaction.
			tr->reset();
			loop {
				try {
					TraceEvent("FileBackupSnapshotDispatchAddingTasks")
					    .suppressFor(2)
					    .detail("TasksToAdd", rangesToAdd.size())
					    .detail("NewBatchSize", newBatchSize);

					tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
					tr->setOption(FDBTransactionOptions::LOCK_AWARE);

					// For each range, make sure it isn't set in the dispatched range map.
					state std::vector<Future<Optional<bool>>> beginReads;
					state std::vector<Future<Optional<bool>>> endReads;

					for (auto& range : rangesToAdd) {
						beginReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.begin));
						endReads.push_back(config.snapshotRangeDispatchMap().get(tr, range.end));
					}

					wait(store(snapshotBatchSize.get(), config.snapshotBatchSize().getOrThrow(tr)) &&
					     waitForAll(beginReads) && waitForAll(endReads) && taskBucket->keepRunning(tr, task));

					// Snapshot batch size should be either oldBatchSize or newBatchSize. If new, this transaction
					// is already done.
					if (snapshotBatchSize.get() == newBatchSize) {
						break;
					} else {
						ASSERT(snapshotBatchSize.get() == oldBatchSize);
						config.snapshotBatchSize().set(tr, newBatchSize);
						snapshotBatchSize = newBatchSize;
						config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().get(task));
						config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get());
					}

					state std::vector<Future<Void>> addTaskFutures;

					for (i = 0; i < beginReads.size(); ++i) {
						KeyRange& range = rangesToAdd[i];

						// This loop might have made changes to begin or end boundaries in a prior
						// iteration.  If so, the updated values exist in the RYW cache so re-read both entries.
						Optional<bool> beginValue = config.snapshotRangeDispatchMap().get(tr, range.begin).get();
						Optional<bool> endValue = config.snapshotRangeDispatchMap().get(tr, range.end).get();

						ASSERT(!beginValue.present() || !endValue.present() || beginValue != endValue);

						// If begin is present, it must be a range end so value must be false
						// If end is present, it must be a range begin so value must be true
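						// For example, dispatching [c,e) adjacent to an already-dispatched [a,c): begin c is
						// present with value false (the end of [a,c)), so it is erased, merging the two ranges
						// into [a,e); end e is absent, so it is set to false to mark the new range end.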
						if ((!beginValue.present() || !beginValue.get()) && (!endValue.present() || endValue.get())) {
							if (beginValue.present()) {
								config.snapshotRangeDispatchMap().erase(tr, range.begin);
							} else {
								config.snapshotRangeDispatchMap().set(tr, range.begin, true);
							}
							if (endValue.present()) {
								config.snapshotRangeDispatchMap().erase(tr, range.end);
							} else {
								config.snapshotRangeDispatchMap().set(tr, range.end, false);
							}

							Version scheduledVersion = invalidVersion;
							// If the next dispatch version is in the future, choose a random version at which to
							// start the new task.
							if (nextDispatchVersion > recentReadVersion)
								scheduledVersion = recentReadVersion + deterministicRandom()->random01() *
								                                           (nextDispatchVersion - recentReadVersion);

							// Range tasks during the initial snapshot should run at a higher priority
							int priority = latestSnapshotEndVersion.present() ? 0 : 1;
							addTaskFutures.push_back(
							    success(BackupRangeTaskFunc::addTask(tr,
							                                         taskBucket,
							                                         task,
							                                         priority,
							                                         range.begin,
							                                         range.end,
							                                         TaskCompletionKey::joinWith(snapshotBatchFuture),
							                                         Reference<TaskFuture>(),
							                                         scheduledVersion)));

							TraceEvent("FileBackupSnapshotRangeDispatched")
							    .suppressFor(2)
							    .detail("BackupUID", config.getUid())
							    .detail("CurrentVersion", recentReadVersion)
							    .detail("ScheduledVersion", scheduledVersion)
							    .detail("BeginKey", range.begin.printable())
							    .detail("EndKey", range.end.printable());
						} else {
							// This shouldn't happen: if the transaction was already done, or if another
							// execution of this task is making progress, it would have been detected above.
							ASSERT(false);
						}
					}

					wait(waitForAll(addTaskFutures));
					wait(tr->commit());
					break;
				} catch (Error& e) {
					wait(tr->onError(e));
				}
			}
		}

		if (countShardsNotDone == 0) {
			TraceEvent("FileBackupSnapshotDispatchFinished")
			    .detail("BackupUID", config.getUid())
			    .detail("AllShards", countAllShards)
			    .detail("ShardsDone", countShardsDone)
			    .detail("ShardsNotDone", countShardsNotDone)
			    .detail("SnapshotBeginVersion", snapshotBeginVersion)
			    .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion)
			    .detail("CurrentVersion", recentReadVersion)
			    .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds)
			    .detail("DispatchTimeSeconds", timer() - startTime);
			Params.snapshotFinished().set(task, true);
		}

		return Void();
	}

	// This function is just a wrapper for BackupSnapshotManifest::addTask(), which is defined below.
	// The BackupSnapshotDispatchTask and BackupSnapshotManifest tasks reference each other, so to keep each
	// task's execute and finish phases defined together inside its class definition, this wrapper is declared
	// here but defined after BackupSnapshotManifest.
	static Future<Key> addSnapshotManifestTask(Reference<ReadYourWritesTransaction> tr,
	                                           Reference<TaskBucket> taskBucket,
	                                           Reference<Task> parentTask,
	                                           TaskCompletionKey completionKey,
	                                           Reference<TaskFuture> waitFor = Reference<TaskFuture>());

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state BackupConfig config(task);

		// Get the batch future and dispatch done keys, then clear them.
		state Key snapshotBatchFutureKey;
		state Key snapshotBatchDispatchDoneKey;

		wait(store(snapshotBatchFutureKey, config.snapshotBatchFuture().getOrThrow(tr)) &&
		     store(snapshotBatchDispatchDoneKey, config.snapshotBatchDispatchDoneKey().getOrThrow(tr)));

		state Reference<TaskFuture> snapshotBatchFuture = futureBucket->unpack(snapshotBatchFutureKey);
		state Reference<TaskFuture> snapshotBatchDispatchDoneFuture =
		    futureBucket->unpack(snapshotBatchDispatchDoneKey);
		config.snapshotBatchFuture().clear(tr);
		config.snapshotBatchDispatchDoneKey().clear(tr);
		config.snapshotBatchSize().clear(tr);

		// Update shardsBehind here again in case the execute phase did not actually have to create any shard tasks
		config.snapshotDispatchLastShardsBehind().set(tr, Params.shardsBehind().getOrDefault(task, 0));
		config.snapshotDispatchLastVersion().set(tr, tr->getReadVersion().get());

		state Reference<TaskFuture> snapshotFinishedFuture = task->getDoneFuture(futureBucket);

		// If the snapshot is finished, the next task is to write a snapshot manifest, otherwise it's another
		// snapshot dispatch task. In either case, the task should wait for snapshotBatchFuture. The snapshot done
		// key, passed to the current task, is also passed on.
		if (Params.snapshotFinished().getOrDefault(task, false)) {
			wait(success(addSnapshotManifestTask(
			    tr, taskBucket, task, TaskCompletionKey::signal(snapshotFinishedFuture), snapshotBatchFuture)));
		} else {
			wait(success(addTask(tr,
			                     taskBucket,
			                     task,
			                     1,
			                     TaskCompletionKey::signal(snapshotFinishedFuture),
			                     snapshotBatchFuture,
			                     Params.nextDispatchVersion().get(task))));
		}

		// This snapshot batch is finished, so set the batch done future.
		wait(snapshotBatchDispatchDoneFuture->set(tr, taskBucket));

		wait(taskBucket->finish(tr, task));

		return Void();
	}
};
StringRef BackupSnapshotDispatchTask::name = "file_backup_dispatch_ranges_5.2"_sr;
REGISTER_TASKFUNC(BackupSnapshotDispatchTask);

struct BackupLogRangeTaskFunc : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		static TaskParam<bool> addBackupLogRangeTasks() { return __FUNCTION__sr; }
		static TaskParam<int64_t> fileSize() { return __FUNCTION__sr; }
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> endVersion() { return __FUNCTION__sr; }
	} Params;

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state Reference<FlowLock> lock(new FlowLock(CLIENT_KNOBS->BACKUP_LOCK_BYTES));

		wait(checkTaskVersion(cx, task, BackupLogRangeTaskFunc::name, BackupLogRangeTaskFunc::version));

		state Version beginVersion = Params.beginVersion().get(task);
		state Version endVersion = Params.endVersion().get(task);

		state BackupConfig config(task);
		state Reference<IBackupContainer> bc;

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		loop {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			// Wait for the read version to pass endVersion
			try {
				wait(taskBucket->keepRunning(tr, task));

				if (!bc) {
					// Backup container must be present if we're still here
					Reference<IBackupContainer> _bc = wait(config.backupContainer().getOrThrow(tr));
					bc = getBackupContainerWithProxy(_bc);
				}

				Version currentVersion = tr->getReadVersion().get();
				if (endVersion < currentVersion)
					break;

				wait(delay(std::max(CLIENT_KNOBS->BACKUP_RANGE_MINWAIT,
				                    (double)(endVersion - currentVersion) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND)));
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		Key destUidValue = wait(config.destUidValue().getOrThrow(tr));

		// Get the set of key ranges that hold mutations for (beginVersion, endVersion).  They will be queried in
		// parallel below.  There is a limit on how many ranges we want to process in a single BackupLogRangeTask,
		// so if that limit is exceeded, set the addBackupLogRangeTasks boolean in Params and stop, signalling the
		// finish() step to break up the (beginVersion, endVersion) range into smaller intervals which are then
		// processed by individual BackupLogRangeTasks.
		state Standalone<VectorRef<KeyRangeRef>> ranges = getLogRanges(beginVersion, endVersion, destUidValue);
		if (ranges.size() > CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) {
			Params.addBackupLogRangeTasks().set(task, true);
			return Void();
		}

		// Block size must be at least large enough for 1 max size key, 1 max size value, and overhead, so
		// conservatively 125k.
		state int blockSize =
		    BUGGIFY ? deterministicRandom()->randomInt(125e3, 4e6) : CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE;
		state Reference<IBackupFile> outFile = wait(bc->writeLogFile(beginVersion, endVersion, blockSize));
		state LogFileWriter logFile(outFile, blockSize);

		// Query all key ranges covering (beginVersion, endVersion) in parallel, writing their results to the
		// results promise stream as they are received.  Note that this means the records read from the results
		// stream are not likely to be in increasing Version order.
		state PromiseStream<RangeResultWithVersion> results;
		state std::vector<Future<Void>> rc;

		for (auto& range : ranges) {
			rc.push_back(
			    readCommitted(cx, results, lock, range, Terminator::False, AccessSystemKeys::True, LockAware::True));
		}

		state Future<Void> sendEOS = map(errorOr(waitForAll(rc)), [=](ErrorOr<Void> const& result) mutable {
			if (result.isError())
				results.sendError(result.getError());
			else
				results.sendError(end_of_stream());
			return Void();
		});

		state Version lastVersion;
		try {
			loop {
				state RangeResultWithVersion r = waitNext(results.getFuture());
				lock->release(r.first.expectedSize());

				state int i = 0;
				for (; i < r.first.size(); ++i) {
					// Remove the backupLogPrefix + UID bytes from the key
					wait(logFile.writeKV(r.first[i].key.substr(backupLogPrefixBytes + 16), r.first[i].value));
					lastVersion = r.second;
				}
			}
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled)
				throw;

			if (e.code() != error_code_end_of_stream) {
				state Error err = e;
				wait(config.logError(cx, err, format("Failed to write to file `%s'", outFile->getFileName().c_str())));
				throw err;
			}
		}

		// Make sure this task is still alive, if it's not then the data read above could be incomplete.
		wait(taskBucket->keepRunning(cx, task));

		wait(outFile->finish());

		TraceEvent("FileBackupWroteLogFile")
		    .suppressFor(60)
		    .detail("BackupUID", config.getUid())
		    .detail("Size", outFile->size())
		    .detail("BeginVersion", beginVersion)
		    .detail("EndVersion", endVersion)
		    .detail("LastReadVersion", lastVersion);

		Params.fileSize().set(task, outFile->size());

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 int priority,
	                                 Version beginVersion,
	                                 Version endVersion,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key key = wait(addBackupTask(
		    BackupLogRangeTaskFunc::name,
		    BackupLogRangeTaskFunc::version,
		    tr,
		    taskBucket,
		    completionKey,
		    BackupConfig(parentTask),
		    waitFor,
		    [=](Reference<Task> task) {
			    Params.beginVersion().set(task, beginVersion);
			    Params.endVersion().set(task, endVersion);
			    Params.addBackupLogRangeTasks().set(task, false);
		    },
		    priority));
		return key;
	}

	ACTOR static Future<Void> startBackupLogRangeInternal(Reference<ReadYourWritesTransaction> tr,
	                                                      Reference<TaskBucket> taskBucket,
	                                                      Reference<FutureBucket> futureBucket,
	                                                      Reference<Task> task,
	                                                      Reference<TaskFuture> taskFuture,
	                                                      Version beginVersion,
	                                                      Version endVersion) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		std::vector<Future<Key>> addTaskVector;
		int tasks = 0;
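		// Split (beginVersion, endVersion) into chunks of BACKUP_MAX_LOG_RANGES version blocks, adding one
		// subtask per chunk, until BACKUP_SHARD_TASK_LIMIT is reached, after which a single final task covers
		// the remainder.  For illustration (knob values hypothetical): with LOG_RANGE_BLOCK_SIZE = 1e6 and
		// BACKUP_MAX_LOG_RANGES = 4, versions (0, 10e6) produce subtasks for (0, 4e6), (4e6, 8e6), and
		// (8e6, 10e6).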
		for (int64_t vblock = beginVersion / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;
		     vblock < (endVersion + CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE - 1) / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;
		     vblock += CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) {
			Version bv = std::max(beginVersion, vblock * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE);

			if (tasks >= CLIENT_KNOBS->BACKUP_SHARD_TASK_LIMIT) {
				addTaskVector.push_back(addTask(tr,
				                                taskBucket,
				                                task,
				                                task->getPriority(),
				                                bv,
				                                endVersion,
				                                TaskCompletionKey::joinWith(taskFuture)));
				break;
			}

			Version ev = std::min(endVersion,
			                      (vblock + CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES) * CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE);
			addTaskVector.push_back(
			    addTask(tr, taskBucket, task, task->getPriority(), bv, ev, TaskCompletionKey::joinWith(taskFuture)));
			tasks++;
		}

		wait(waitForAll(addTaskVector));

		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state Version beginVersion = Params.beginVersion().get(task);
		state Version endVersion = Params.endVersion().get(task);
		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);
		state BackupConfig config(task);

		if (Params.fileSize().exists(task)) {
			config.logBytesWritten().atomicOp(tr, Params.fileSize().get(task), MutationRef::AddValue);
		}

		if (Params.addBackupLogRangeTasks().get(task)) {
			wait(startBackupLogRangeInternal(tr, taskBucket, futureBucket, task, taskFuture, beginVersion, endVersion));
		} else {
			wait(taskFuture->set(tr, taskBucket));
		}

		wait(taskBucket->finish(tr, task));
		return Void();
	}
};

StringRef BackupLogRangeTaskFunc::name = "file_backup_write_logs_5.2"_sr;
REGISTER_TASKFUNC(BackupLogRangeTaskFunc);

// This task stopped being used in 6.2; however, the code remains here to handle upgrades.
struct EraseLogRangeTaskFunc : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	static struct {
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> endVersion() { return __FUNCTION__sr; }
		static TaskParam<Key> destUidValue() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 UID logUid,
	                                 TaskCompletionKey completionKey,
	                                 Key destUidValue,
	                                 Version endVersion = 0,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key key = wait(addBackupTask(
		    EraseLogRangeTaskFunc::name,
		    EraseLogRangeTaskFunc::version,
		    tr,
		    taskBucket,
		    completionKey,
		    BackupConfig(logUid),
		    waitFor,
		    [=](Reference<Task> task) {
			    Params.beginVersion().set(task,
			                              1); // FIXME: remove in 6.X, only needed for 5.2 backward compatibility
			    Params.endVersion().set(task, endVersion);
			    Params.destUidValue().set(task, destUidValue);
		    },
		    0,
		    SetValidation::False));

		return key;
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);

		wait(checkTaskVersion(tr->getDatabase(), task, EraseLogRangeTaskFunc::name, EraseLogRangeTaskFunc::version));

		state Version endVersion = Params.endVersion().get(task);
		state Key destUidValue = Params.destUidValue().get(task);

		state BackupConfig config(task);
		state Key logUidValue = config.getUidAsKey();

		wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) &&
		     eraseLogData(
		         tr, logUidValue, destUidValue, endVersion != 0 ? Optional<Version>(endVersion) : Optional<Version>()));

		return Void();
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef EraseLogRangeTaskFunc::name = "file_backup_erase_logs_5.2"_sr;
REGISTER_TASKFUNC(EraseLogRangeTaskFunc);

struct BackupLogsDispatchTask : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		static TaskParam<Version> prevBeginVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		wait(checkTaskVersion(tr->getDatabase(), task, BackupLogsDispatchTask::name, BackupLogsDispatchTask::version));

		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		state Reference<TaskFuture> onDone = task->getDoneFuture(futureBucket);
		state Version prevBeginVersion = Params.prevBeginVersion().get(task);
		state Version beginVersion = Params.beginVersion().get(task);
		state BackupConfig config(task);
		config.latestLogEndVersion().set(tr, beginVersion);

		state bool stopWhenDone;
		state Optional<Version> restorableVersion;
		state EBackupState backupState;
		state Optional<std::string> tag;
		state Optional<Version> latestSnapshotEndVersion;
		state Optional<bool> partitionedLog;

		wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) &&
		     store(restorableVersion, config.getLatestRestorableVersion(tr)) &&
		     store(backupState, config.stateEnum().getOrThrow(tr)) && store(tag, config.tag().get(tr)) &&
		     store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) &&
		     store(partitionedLog, config.partitionedLogEnabled().get(tr)));

		// If restorable, update the last restorable version for this tag
		if (restorableVersion.present() && tag.present()) {
			FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get());
		}

		// If the backup is restorable but the state is not differential then set state to differential
		if (restorableVersion.present() && backupState != EBackupState::STATE_RUNNING_DIFFERENTIAL)
			config.stateEnum().set(tr, EBackupState::STATE_RUNNING_DIFFERENTIAL);

		// If stopWhenDone is set and there is a restorable version, set the done future and do not create further
		// tasks.
		if (stopWhenDone && restorableVersion.present()) {
			wait(onDone->set(tr, taskBucket) && taskBucket->finish(tr, task));

			TraceEvent("FileBackupLogsDispatchDone")
			    .detail("BackupUID", config.getUid())
			    .detail("BeginVersion", beginVersion)
			    .detail("RestorableVersion", restorableVersion.orDefault(-1));

			return Void();
		}

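		// The next log batch must reach at least one version past the current read version, and nominally
		// extends BACKUP_MAX_LOG_RANGES - 1 version blocks past beginVersion.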
		state Version endVersion = std::max<Version>(tr->getReadVersion().get() + 1,
		                                             beginVersion + (CLIENT_KNOBS->BACKUP_MAX_LOG_RANGES - 1) *
		                                                                CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE);

		TraceEvent("FileBackupLogDispatch")
		    .suppressFor(60)
		    .detail("BeginVersion", beginVersion)
		    .detail("EndVersion", endVersion)
		    .detail("RestorableVersion", restorableVersion.orDefault(-1));

		state Reference<TaskFuture> logDispatchBatchFuture = futureBucket->future(tr);

		// If a snapshot has ended for this backup then mutations are higher priority to reduce backup lag
		state int priority = latestSnapshotEndVersion.present() ? 1 : 0;

		if (!partitionedLog.present() || !partitionedLog.get()) {
			// Add the initial log range task to read/copy the mutations, plus the next logs dispatch task,
			// which will run after this batch is done.
			// The log range task reads the backup mutation log (blog/ prefix) and writes its
			// (param1, param2) pairs into files.
			wait(success(BackupLogRangeTaskFunc::addTask(tr,
			                                             taskBucket,
			                                             task,
			                                             priority,
			                                             beginVersion,
			                                             endVersion,
			                                             TaskCompletionKey::joinWith(logDispatchBatchFuture))));
			// issue the next key range
			wait(success(BackupLogsDispatchTask::addTask(tr,
			                                             taskBucket,
			                                             task,
			                                             priority,
			                                             beginVersion,
			                                             endVersion,
			                                             TaskCompletionKey::signal(onDone),
			                                             logDispatchBatchFuture)));

			// Do not erase log data on the first pass
			if (prevBeginVersion > 0) {
				state Key destUidValue = wait(config.destUidValue().getOrThrow(tr));
				wait(eraseLogData(tr, config.getUidAsKey(), destUidValue, Optional<Version>(beginVersion)));
			}
		} else {
			// Skip mutation copy and erase backup mutations. Just check back periodically.
			Version scheduledVersion = tr->getReadVersion().get() +
			                           CLIENT_KNOBS->BACKUP_POLL_PROGRESS_SECONDS * CLIENT_KNOBS->VERSIONS_PER_SECOND;
			wait(success(BackupLogsDispatchTask::addTask(tr,
			                                             taskBucket,
			                                             task,
			                                             1,
			                                             beginVersion,
			                                             endVersion,
			                                             TaskCompletionKey::signal(onDone),
			                                             Reference<TaskFuture>(),
			                                             scheduledVersion)));
		}

		wait(taskBucket->finish(tr, task));

		TraceEvent("FileBackupLogsDispatchContinuing")
		    .suppressFor(60)
		    .detail("BackupUID", config.getUid())
		    .detail("BeginVersion", beginVersion)
		    .detail("EndVersion", endVersion);

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 int priority,
	                                 Version prevBeginVersion,
	                                 Version beginVersion,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>(),
	                                 Version scheduledVersion = invalidVersion) {
		Key key = wait(addBackupTask(
		    BackupLogsDispatchTask::name,
		    BackupLogsDispatchTask::version,
		    tr,
		    taskBucket,
		    completionKey,
		    BackupConfig(parentTask),
		    waitFor,
		    [=](Reference<Task> task) {
			    Params.prevBeginVersion().set(task, prevBeginVersion);
			    Params.beginVersion().set(task, beginVersion);
			    if (scheduledVersion != invalidVersion) {
				    ReservedTaskParams::scheduledVersion().set(task, scheduledVersion);
			    }
		    },
		    priority));
		return key;
	}

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef BackupLogsDispatchTask::name = "file_backup_dispatch_logs_5.2"_sr;
REGISTER_TASKFUNC(BackupLogsDispatchTask);

struct FileBackupFinishedTask : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	StringRef getName() const override { return name; };

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		wait(checkTaskVersion(tr->getDatabase(), task, FileBackupFinishedTask::name, FileBackupFinishedTask::version));

		state BackupConfig backup(task);
		state UID uid = backup.getUid();

		tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);
		state Key destUidValue = wait(backup.destUidValue().getOrThrow(tr));

		wait(eraseLogData(tr, backup.getUidAsKey(), destUidValue) && clearBackupStartID(tr, uid));

		backup.stateEnum().set(tr, EBackupState::STATE_COMPLETED);

		wait(taskBucket->finish(tr, task));

		TraceEvent("FileBackupFinished").detail("BackupUID", uid);

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key key = wait(addBackupTask(FileBackupFinishedTask::name,
		                             FileBackupFinishedTask::version,
		                             tr,
		                             taskBucket,
		                             completionKey,
		                             BackupConfig(parentTask),
		                             waitFor));
		return key;
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef FileBackupFinishedTask::name = "file_backup_finished_5.2"_sr;
REGISTER_TASKFUNC(FileBackupFinishedTask);

struct BackupSnapshotManifest : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	static struct {
		static TaskParam<Version> endVersion() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state BackupConfig config(task);
		state Reference<IBackupContainer> bc;
		state DatabaseConfiguration dbConfig;

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));

		// Read the entire range file map into memory, then walk it backwards from its last entry to produce a list
		// of non-overlapping key range files
		state std::map<Key, BackupConfig::RangeSlice> localmap;
		state Key startKey;
		state int batchSize = BUGGIFY ? 1 : 1000000;

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(taskBucket->keepRunning(tr, task));
				wait(store(dbConfig, getDatabaseConfiguration(cx)));

				if (!bc) {
					// Backup container must be present if we're still here
					Reference<IBackupContainer> _bc = wait(config.backupContainer().getOrThrow(tr));
					bc = getBackupContainerWithProxy(_bc);
				}

				BackupConfig::RangeFileMapT::RangeResultType rangeresults =
				    wait(config.snapshotRangeFileMap().getRange(tr, startKey, {}, batchSize));

				for (auto& p : rangeresults.results) {
					localmap.insert(p);
				}

				if (!rangeresults.more)
					break;

				startKey = keyAfter(rangeresults.results.back().first);
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		state std::vector<std::string> files;
		state std::vector<std::pair<Key, Key>> beginEndKeys;
		state Version maxVer = 0;
		state Version minVer = std::numeric_limits<Version>::max();
		state int64_t totalBytes = 0;

		if (!localmap.empty()) {
			// Get iterator that points to greatest key, start there.
			auto ri = localmap.rbegin();
			auto i = (++ri).base();

			while (1) {
				const BackupConfig::RangeSlice& r = i->second;

				// Add file to final file list
				files.push_back(r.fileName);

				// Add (beginKey, endKey) pairs to the list
				beginEndKeys.emplace_back(i->second.begin, i->first);

				// Update version range seen
				if (r.version < minVer)
					minVer = r.version;
				if (r.version > maxVer)
					maxVer = r.version;

				// Update total bytes counted.
				totalBytes += r.fileSize;

				// Jump to the file that either ends where this file begins or has the greatest end that is less
				// than the begin of this file.  In other words, find the map key that is <= the begin of this
				// file.  To do this, find the first end strictly greater than begin and then back up one.
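				// For example, if the map (keyed by range end) holds {c -> begin a, f -> begin d, k -> begin f}
				// and the current file is [f,k), then upper_bound(f) lands on k and backing up one lands on
				// f -> [d,f), which ends exactly where the current file begins.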
				i = localmap.upper_bound(i->second.begin);
				// If we get begin() then we're done; there are no more ranges that end at or before the last
				// file's begin.
				if (i == localmap.begin())
					break;
				--i;
			}
		}

		Params.endVersion().set(task, maxVer);

		// Include the key range map in the manifest only when encryption-at-rest is disabled; encrypted backups
		// avoid the keyRange filtering optimization for 'manifest' files.
		wait(bc->writeKeyspaceSnapshotFile(files,
		                                   beginEndKeys,
		                                   totalBytes,
		                                   dbConfig.encryptionAtRestMode.isEncryptionEnabled()
		                                       ? IncludeKeyRangeMap::False
		                                       : IncludeKeyRangeMap::True));

		TraceEvent(SevInfo, "FileBackupWroteSnapshotManifest")
		    .detail("BackupUID", config.getUid())
		    .detail("BeginVersion", minVer)
		    .detail("EndVersion", maxVer)
		    .detail("TotalBytes", totalBytes);

		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		wait(checkTaskVersion(tr->getDatabase(), task, BackupSnapshotManifest::name, BackupSnapshotManifest::version));

		state BackupConfig config(task);

		// Set the latest snapshot end version, which was set during the execute phase
		config.latestSnapshotEndVersion().set(tr, Params.endVersion().get(task));

		state bool stopWhenDone;
		state EBackupState backupState;
		state Optional<Version> restorableVersion;
		state Optional<Version> firstSnapshotEndVersion;
		state Optional<std::string> tag;

		wait(store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) &&
		     store(backupState, config.stateEnum().getOrThrow(tr)) &&
		     store(restorableVersion, config.getLatestRestorableVersion(tr)) &&
		     store(firstSnapshotEndVersion, config.firstSnapshotEndVersion().get(tr)) &&
		     store(tag, config.tag().get(tr)));

		// If restorable, update the last restorable version for this tag
		if (restorableVersion.present() && tag.present()) {
			FileBackupAgent().setLastRestorable(tr, StringRef(tag.get()), restorableVersion.get());
		}

		if (!firstSnapshotEndVersion.present()) {
			config.firstSnapshotEndVersion().set(tr, Params.endVersion().get(task));
		}

		// If the backup is restorable and the state isn't differential then set the state to differential
		if (restorableVersion.present() && backupState != EBackupState::STATE_RUNNING_DIFFERENTIAL)
			config.stateEnum().set(tr, EBackupState::STATE_RUNNING_DIFFERENTIAL);

		// Unless we are to stop, start the next snapshot using the default interval
		Reference<TaskFuture> snapshotDoneFuture = task->getDoneFuture(futureBucket);
		if (!stopWhenDone) {
			wait(config.initNewSnapshot(tr) &&
			     success(BackupSnapshotDispatchTask::addTask(
			         tr, taskBucket, task, 1, TaskCompletionKey::signal(snapshotDoneFuture))));
		} else {
			// Set the done future as the snapshot is now complete.
			wait(snapshotDoneFuture->set(tr, taskBucket));
		}

		wait(taskBucket->finish(tr, task));
		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key key = wait(addBackupTask(BackupSnapshotManifest::name,
		                             BackupSnapshotManifest::version,
		                             tr,
		                             taskBucket,
		                             completionKey,
		                             BackupConfig(parentTask),
		                             waitFor,
		                             NOP_SETUP_TASK_FN,
		                             1));
		return key;
	}

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef BackupSnapshotManifest::name = "file_backup_write_snapshot_manifest_5.2"_sr;
REGISTER_TASKFUNC(BackupSnapshotManifest);

Future<Key> BackupSnapshotDispatchTask::addSnapshotManifestTask(Reference<ReadYourWritesTransaction> tr,
                                                                Reference<TaskBucket> taskBucket,
                                                                Reference<Task> parentTask,
                                                                TaskCompletionKey completionKey,
                                                                Reference<TaskFuture> waitFor) {
	return BackupSnapshotManifest::addTask(tr, taskBucket, parentTask, completionKey, waitFor);
}

struct StartFullBackupTaskFunc : BackupTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version));

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state BackupConfig config(task);
		state Future<Optional<bool>> partitionedLog;
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				partitionedLog = config.partitionedLogEnabled().get(tr);
				wait(success(partitionedLog));
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// Check if backup worker is enabled
		DatabaseConfiguration dbConfig = wait(getDatabaseConfiguration(cx));
		state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled;
		if (!backupWorkerEnabled && partitionedLog.get().present() && partitionedLog.get().get()) {
			// Change the configuration only when partitioned logs are requested and the flag was not set
			// before.
			wait(success(ManagementAPI::changeConfig(cx.getReference(), "backup_worker_enabled:=1", true)));
			backupWorkerEnabled = true;
			// The user is responsible for manually disabling the backup worker after the last backup is done.
		}

		// Get the start version after backup workers are enabled
		loop {
			try {
				tr->reset();
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state Future<Version> startVersionFuture = tr->getReadVersion();
				wait(success(startVersionFuture));
				Params.beginVersion().set(task, startVersionFuture.get());
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// Set the "backupStartedKey" and wait for all backup worker started
		tr->reset();
		loop {
			state Future<Void> watchFuture;
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				state Future<Void> keepRunning = taskBucket->keepRunning(tr, task);

				state Future<Optional<Value>> started = tr->get(backupStartedKey);
				state Future<Optional<Value>> taskStarted = tr->get(config.allWorkerStarted().key);
				partitionedLog = config.partitionedLogEnabled().get(tr);
				wait(success(started) && success(taskStarted) && success(partitionedLog));

				if (!partitionedLog.get().present() || !partitionedLog.get().get()) {
					return Void(); // Skip if not using partitioned logs
				}

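				// Register this backup's UID and begin version in backupStartedKey so backup workers will pick
				// it up.  If this task was restarted and the UID is already registered, reuse the previously
				// recorded begin version instead.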
				std::vector<std::pair<UID, Version>> ids;
				if (started.get().present()) {
					ids = decodeBackupStartedValue(started.get().get());
				}
				const UID uid = config.getUid();
				auto it = std::find_if(
				    ids.begin(), ids.end(), [uid](const std::pair<UID, Version>& p) { return p.first == uid; });
				if (it == ids.end()) {
					ids.emplace_back(uid, Params.beginVersion().get(task));
				} else {
					Params.beginVersion().set(task, it->second);
				}

				tr->set(backupStartedKey, encodeBackupStartedValue(ids));

				// The task may be restarted. Set the watch if the started key has NOT been set yet.
				if (!taskStarted.get().present()) {
					watchFuture = tr->watch(config.allWorkerStarted().key);
				}

				wait(keepRunning);
				wait(tr->commit());
				if (!taskStarted.get().present()) {
					wait(watchFuture);
				}
				return Void();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state BackupConfig config(task);
		state Version beginVersion = Params.beginVersion().get(task);

		state Future<std::vector<KeyRange>> backupRangesFuture = config.backupRanges().getOrThrow(tr);
		state Future<Key> destUidValueFuture = config.destUidValue().getOrThrow(tr);
		state Future<Optional<bool>> partitionedLog = config.partitionedLogEnabled().get(tr);
		state Future<Optional<bool>> incrementalBackupOnly = config.incrementalBackupOnly().get(tr);
		wait(success(backupRangesFuture) && success(destUidValueFuture) && success(partitionedLog) &&
		     success(incrementalBackupOnly));
		std::vector<KeyRange> backupRanges = backupRangesFuture.get();
		Key destUidValue = destUidValueFuture.get();

		// Start logging the mutations for the specified ranges of the tag if needed
		if (!partitionedLog.get().present() || !partitionedLog.get().get()) {
			for (auto& backupRange : backupRanges) {
				config.startMutationLogs(tr, backupRange, destUidValue);
			}
		}

		config.stateEnum().set(tr, EBackupState::STATE_RUNNING);

		state Reference<TaskFuture> backupFinished = futureBucket->future(tr);

		// Initialize the initial snapshot and create tasks to continually write logs and snapshots.
		state Optional<int64_t> initialSnapshotIntervalSeconds = wait(config.initialSnapshotIntervalSeconds().get(tr));
		wait(config.initNewSnapshot(tr, initialSnapshotIntervalSeconds.orDefault(0)));

		// Use priority 1 for both of these so that both tasks start soon.
		// Do not add a snapshot task if only the incremental backup is wanted.
		if (!incrementalBackupOnly.get().present() || !incrementalBackupOnly.get().get()) {
			wait(success(BackupSnapshotDispatchTask::addTask(
			    tr, taskBucket, task, 1, TaskCompletionKey::joinWith(backupFinished))));
		}
		wait(success(BackupLogsDispatchTask::addTask(
		    tr, taskBucket, task, 1, 0, beginVersion, TaskCompletionKey::joinWith(backupFinished))));

		// If a clean stop is requested, the log and snapshot tasks will quit after the backup is restorable, then
		// the following task will clean up and set the completed state.
		wait(success(
		    FileBackupFinishedTask::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal(), backupFinished)));

		wait(taskBucket->finish(tr, task));

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 UID uid,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key key = wait(addBackupTask(StartFullBackupTaskFunc::name,
		                             StartFullBackupTaskFunc::version,
		                             tr,
		                             taskBucket,
		                             completionKey,
		                             BackupConfig(uid),
		                             waitFor));
		return key;
	}

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef StartFullBackupTaskFunc::name = "file_backup_start_5.2"_sr;
REGISTER_TASKFUNC(StartFullBackupTaskFunc);

struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {

		wait(checkTaskVersion(tr->getDatabase(), task, name, version));

		state RestoreConfig restore(task);
		restore.stateEnum().set(tr, ERestoreState::COMPLETED);
		state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));

		tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
		// Clear the file map now since it could be huge.
		restore.fileSet().clear(tr);

		// TODO:  Validate that the range version map has exactly the restored ranges in it.  This means that for
		// any restore operation the ranges to restore must be within the backed up ranges, otherwise from the
		// restore perspective it will appear that some key ranges were missing and so the backup set is incomplete
		// and the restore has failed. This validation cannot be done currently because Restore only supports a
		// single restore range but backups can have many ranges.

		// Clear the applyMutations stuff, including any unapplied mutations from versions beyond the restored
		// version.
		restore.clearApplyMutationsKeys(tr);

		wait(taskBucket->finish(tr, task));

		if (unlockDB) {
			wait(unlockDatabase(tr, restore.getUid()));
		}

		bool bgRestoreEnabled = wait(restore.isBlobGranuleRestore().getD(tr));
		if (bgRestoreEnabled) {
			// Accessing historical data is not supported after a blob granule restore, so clean up the following keys
			TraceEvent("BlobGranuleRestoreCleanup").log();
			CODE_PROBE(true, "Cleanup blob granules keys after restore");
			tr->clear(blobGranuleHistoryKeys);
			tr->clear(blobGranuleFileKeys);
			tr->clear(blobGranuleMappingKeys);
			tr->clear(blobGranuleLockKeys);
			BlobGranuleRestoreConfig().phase().set(tr, BlobRestorePhase::DONE);
			BlobGranuleRestoreConfig().phaseStartTs().set(tr, BlobRestorePhase::DONE, now());
		}

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));
		state Reference<Task> task(new Task(RestoreCompleteTaskFunc::name, RestoreCompleteTaskFunc::version, doneKey));

		// Get restore config from parent task and bind it to new task
		wait(RestoreConfig(parentTask).toTask(tr, task));

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreCompleteTaskFunc::name = "restore_complete"_sr;
REGISTER_TASKFUNC(RestoreCompleteTaskFunc);

struct RestoreFileTaskFuncBase : RestoreTaskFuncBase {
	struct InputParams {
		static TaskParam<RestoreFile> inputFile() { return __FUNCTION__sr; }
		static TaskParam<int64_t> readOffset() { return __FUNCTION__sr; }
		static TaskParam<int64_t> readLen() { return __FUNCTION__sr; }
	} Params;

	std::string toString(Reference<Task> task) const override {
		return format("fileName '%s' readLen %lld readOffset %lld",
		              Params.inputFile().get(task).fileName.c_str(),
		              Params.readLen().get(task),
		              Params.readOffset().get(task));
	}
};

struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
	static struct : InputParams {
		// The key range that the (possibly empty) data in this block represented; it is set if the
		// range intersects the target restore range
		static TaskParam<KeyRange> originalFileRange() { return __FUNCTION__sr; }
		static TaskParam<std::vector<KeyRange>> originalFileRanges() { return __FUNCTION__sr; }

		static std::vector<KeyRange> getOriginalFileRanges(Reference<Task> task) {
			if (originalFileRanges().exists(task)) {
				return Params.originalFileRanges().get(task);
			} else {
				std::vector<KeyRange> range;
				if (originalFileRange().exists(task))
					range.push_back(Params.originalFileRange().get(task));
				return range;
			}
		}
	} Params;

	std::string toString(Reference<Task> task) const override {
		std::string returnStr = RestoreFileTaskFuncBase::toString(task);
		for (auto& range : Params.getOriginalFileRanges(task))
			returnStr += format("  originalFileRange '%s'", printable(range).c_str());
		return returnStr;
	}

	ACTOR static Future<Void> _validTenantAccess(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache) {
		if (isSystemKey(key)) {
			return Void();
		}
		state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(key);
		Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache->getById(tenantId));
		ASSERT(payload.present());
		return Void();
	}

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state RestoreConfig restore(task);

		state RestoreFile rangeFile = Params.inputFile().get(task);
		state int64_t readOffset = Params.readOffset().get(task);
		state int64_t readLen = Params.readLen().get(task);

		TraceEvent("FileRestoreRangeStart")
		    .suppressFor(60)
		    .detail("RestoreUID", restore.getUid())
		    .detail("FileName", rangeFile.fileName)
		    .detail("FileVersion", rangeFile.version)
		    .detail("FileSize", rangeFile.fileSize)
		    .detail("ReadOffset", readOffset)
		    .detail("ReadLen", readLen)
		    .detail("TaskInstance", THIS_ADDR);

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state Future<Reference<IBackupContainer>> bc;
		state Future<std::vector<KeyRange>> restoreRanges;
		state Future<Key> addPrefix;
		state Future<Key> removePrefix;

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				Reference<IBackupContainer> _bc = wait(restore.sourceContainer().getOrThrow(tr));
				bc = getBackupContainerWithProxy(_bc);
				restoreRanges = restore.getRestoreRangesOrDefault(tr);
				addPrefix = restore.addPrefix().getD(tr);
				removePrefix = restore.removePrefix().getD(tr);

				wait(taskBucket->keepRunning(tr, task));

				wait(success(bc) && success(restoreRanges) && success(addPrefix) && success(removePrefix) &&
				     checkTaskVersion(tr->getDatabase(), task, name, version));
				break;

			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		state Reference<IAsyncFile> inFile = wait(bc.get()->readFile(rangeFile.fileName));
		state Standalone<VectorRef<KeyValueRef>> blockData;
		try {
			// The data is actual KV pairs, not encoded mutations
			Standalone<VectorRef<KeyValueRef>> data = wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx));
			blockData = data;
		} catch (Error& e) {
			// It's possible a tenant was deleted and the encrypt key fetch failed
			if (e.code() == error_code_encrypt_keys_fetch_failed || e.code() == error_code_tenant_not_found ||
			    e.code() == error_code_encrypt_key_not_found) {
				return Void();
			}
			throw;
		}
		state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
		state std::vector<Future<Void>> validTenantCheckFutures;
		state Arena arena;
		state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
		if (config.tenantMode == TenantMode::REQUIRED && g_network && g_network->isSimulated()) {
			tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
			wait(tenantCache.get()->init());
		}

		// First and last key are the range for this file
		state KeyRange fileRange;
		state std::vector<KeyRange> originalFileRanges;
		// If fileRange doesn't intersect restore range then we're done.
		state int index;
		for (index = 0; index < restoreRanges.get().size(); index++) {
			auto& restoreRange = restoreRanges.get()[index];
			fileRange = KeyRangeRef(blockData.front().key, blockData.back().key);
			if (!fileRange.intersects(restoreRange))
				continue;

			// We know the file range intersects the restore range but there could still be keys outside the restore
			// range. Find the subvector of kv pairs that intersect the restore range.  Note that the first and last
			// keys are just the range endpoints for this file
			int rangeStart = 1;
			int rangeEnd = blockData.size() - 1;
			// Slide start forward, stop if something in range is found
			while (rangeStart < rangeEnd && !restoreRange.contains(blockData[rangeStart].key))
				++rangeStart;
			// Slide end backward, stop if something in range is found
			while (rangeEnd > rangeStart && !restoreRange.contains(blockData[rangeEnd - 1].key))
				--rangeEnd;
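			// For example, with blockData keys [b, c, d, e, f] (where b and f are just the file's
			// range endpoints) and a restore range of [d, z), rangeStart advances to d's index while
			// rangeEnd stays past e, so the slice below is [d, e].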

			state VectorRef<KeyValueRef> data = blockData.slice(rangeStart, rangeEnd);

			// Shrink file range to be entirely within restoreRange and translate it to the new prefix
			// First, use the untranslated file range to create the shrunk original file range which must be used in
			// the kv range version map for applying mutations
			state KeyRange originalFileRange =
			    KeyRangeRef(std::max(fileRange.begin, restoreRange.begin), std::min(fileRange.end, restoreRange.end));
			originalFileRanges.push_back(originalFileRange);

			// Now shrink and translate fileRange
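			// For example, with removePrefix "old/" and addPrefix "new/", the key "old/k" maps to
			// "new/k". The special case below handles a file end equal to strinc(removePrefix),
			// which must map to strinc(addPrefix) (or to allKeys.end when the prefix is empty).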
			Key fileEnd = std::min(fileRange.end, restoreRange.end);
			if (fileEnd == (removePrefix.get() == StringRef() ? allKeys.end : strinc(removePrefix.get()))) {
				fileEnd = addPrefix.get() == StringRef() ? allKeys.end : strinc(addPrefix.get());
			} else {
				fileEnd = fileEnd.removePrefix(removePrefix.get()).withPrefix(addPrefix.get());
			}
			fileRange = KeyRangeRef(std::max(fileRange.begin, restoreRange.begin)
			                            .removePrefix(removePrefix.get())
			                            .withPrefix(addPrefix.get()),
			                        fileEnd);

			state int start = 0;
			state int end = data.size();
			state int dataSizeLimit =
			    BUGGIFY ? deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE;

			tr->reset();
			loop {
				try {
					tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
					tr->setOption(FDBTransactionOptions::LOCK_AWARE);

					state int i = start;
					state int txBytes = 0;
					state int iend = start;

					// find iend that results in the desired transaction size
					for (; iend < end && txBytes < dataSizeLimit; ++iend) {
						txBytes += data[iend].key.expectedSize();
						txBytes += data[iend].value.expectedSize();
					}

					// Clear the range we are about to set.
					// If start == 0 then use fileRange.begin for the start of the range, else data[start]
					// If iend == end then use fileRange.end for the end of the range, else data[iend]
					// Clear the raw keys (without the alog prefix)
					state KeyRange trRange = KeyRangeRef(
					    (start == 0) ? fileRange.begin
					                 : data[start].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()),
					    (iend == end) ? fileRange.end
					                  : data[iend].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()));
					tr->clear(trRange);

					for (; i < iend; ++i) {
						tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
						if (tenantCache.present()) {
							validTenantCheckFutures.push_back(_validTenantAccess(
							    StringRef(arena,
							              data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())),
							    tenantCache.get()));
						}
						tr->set(data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()),
						        data[i].value);
					}

					// Add to bytes written count
					restore.bytesWritten().atomicOp(tr, txBytes, MutationRef::Type::AddValue);

					state Future<Void> checkLock = checkDatabaseLock(tr, restore.getUid());

					wait(taskBucket->keepRunning(tr, task));

					wait(checkLock);

					wait(tr->commit());

					if (!validTenantCheckFutures.empty()) {
						wait(waitForAll(validTenantCheckFutures));
						validTenantCheckFutures.clear();
					}

					TraceEvent("FileRestoreCommittedRange")
					    .suppressFor(60)
					    .detail("RestoreUID", restore.getUid())
					    .detail("FileName", rangeFile.fileName)
					    .detail("FileVersion", rangeFile.version)
					    .detail("FileSize", rangeFile.fileSize)
					    .detail("ReadOffset", readOffset)
					    .detail("ReadLen", readLen)
					    .detail("CommitVersion", tr->getCommittedVersion())
					    .detail("BeginRange", trRange.begin)
					    .detail("EndRange", trRange.end)
					    .detail("StartIndex", start)
					    .detail("EndIndex", i)
					    .detail("DataSize", data.size())
					    .detail("Bytes", txBytes)
					    .detail("OriginalFileRange", originalFileRange)
					    .detail("TaskInstance", THIS_ADDR);

					// Commit succeeded, so advance starting point
					start = i;

					if (start == end)
						break;
					tr->reset();
				} catch (Error& e) {
					if (e.code() == error_code_transaction_too_large)
						dataSizeLimit /= 2;
					else
						wait(tr->onError(e));
				}
			}
		}
		if (!originalFileRanges.empty()) {
			if (BUGGIFY && restoreRanges.get().size() == 1) {
				Params.originalFileRange().set(task, originalFileRanges[0]);
			} else {
				Params.originalFileRanges().set(task, originalFileRanges);
			}
		}
		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state RestoreConfig restore(task);
		restore.fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue);

		// Update the KV range map if originalFileRange is set
		std::vector<Future<Void>> updateMap;
		std::vector<KeyRange> ranges = Params.getOriginalFileRanges(task);
		// e.g., if restoring ((a, b), (e, f), (x, y)), there are 3 ranges
		for (auto& range : ranges) {
			Value versionEncoded = BinaryWriter::toValue(Params.inputFile().get(task).version, Unversioned());
			updateMap.push_back(krmSetRange(tr, restore.applyMutationsMapPrefix(), range, versionEncoded));
		}

		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);
		wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task) && waitForAll(updateMap));

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 RestoreFile rf,
	                                 int64_t offset,
	                                 int64_t len,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));
		state Reference<Task> task(new Task(RestoreRangeTaskFunc::name, RestoreRangeTaskFunc::version, doneKey));

		// Create a restore config from the current task and bind it to the new task.
		wait(RestoreConfig(parentTask).toTask(tr, task));

		Params.inputFile().set(task, rf);
		Params.readOffset().set(task, offset);
		Params.readLen().set(task, len);

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreRangeTaskFunc::name = "restore_range_data"_sr;
REGISTER_TASKFUNC(RestoreRangeTaskFunc);

// Decodes a mutation log key, which contains (hash, commitVersion, chunkNumber) and
// returns (commitVersion, chunkNumber)
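// Key layout (13 bytes): [hash: 1 byte][commitVersion: 8 bytes, big-endian][chunkNumber: 4 bytes,
// big-endian]. The hash byte is the low byte of hashlittle() over
// (commitVersion / LOG_RANGE_BLOCK_SIZE), which the ASSERT below re-derives to validate the key.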
std::pair<Version, int32_t> decodeMutationLogKey(const StringRef& key) {
	ASSERT(key.size() == sizeof(uint8_t) + sizeof(Version) + sizeof(int32_t));

	uint8_t hash;
	Version version;
	int32_t part;
	BinaryReader rd(key, Unversioned());
	rd >> hash >> version >> part;
	version = bigEndian64(version);
	part = bigEndian32(part);

	int32_t v = version / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;
	ASSERT(((uint8_t)hashlittle(&v, sizeof(v), 0)) == hash);

	return std::make_pair(version, part);
}

// Decodes an encoded list of mutations in the format of:
//   [includeVersion:uint64_t][val_length:uint32_t][mutation_1][mutation_2]...[mutation_k],
// where a mutation is encoded as:
//   [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][param1][param2]
// Note that the protocol version (0x0FDB00A200090001) needs to be included here
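// For example, a block holding a single SetValue of key "a" to value "b" decodes to one MutationRef
// with a 12-byte header (three uint32s: type, keyLength, valueLength) followed by the 1-byte key and
// the 1-byte value, so val_length would be 14.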
std::vector<MutationRef> decodeMutationLogValue(const StringRef& value) {
	StringRefReader reader(value, restore_corrupted_data());

	Version protocolVersion = reader.consume<uint64_t>();
	if (protocolVersion <= 0x0FDB00A200090001) {
		throw incompatible_protocol_version();
	}

	uint32_t val_length = reader.consume<uint32_t>();
	if (val_length != value.size() - sizeof(uint64_t) - sizeof(uint32_t)) {
		TraceEvent(SevError, "FileRestoreLogValueError")
		    .detail("ValueLen", val_length)
		    .detail("ValueSize", value.size())
		    .detail("Value", printable(value));
	}

	std::vector<MutationRef> mutations;
	while (1) {
		if (reader.eof())
			break;

		// Deserialization of a MutationRef, which was packed by MutationListRef::push_back_deep()
		uint32_t type, p1len, p2len;
		type = reader.consume<uint32_t>();
		p1len = reader.consume<uint32_t>();
		p2len = reader.consume<uint32_t>();

		const uint8_t* key = reader.consume(p1len);
		const uint8_t* val = reader.consume(p2len);

		mutations.emplace_back((MutationRef::Type)type, StringRef(key, p1len), StringRef(val, p2len));
	}
	return mutations;
}

void AccumulatedMutations::addChunk(int chunkNumber, const KeyValueRef& kv) {
	// Chunk numbers for a given version must be contiguous; a gap invalidates the whole group
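	// Assuming lastChunkNumber starts at -1, chunks arriving as 0, 1, 2 accumulate their values into
	// serializedMutations, while a gap (e.g., 0 then 2) clears serializedMutations and sets
	// lastChunkNumber to -2, marking the group incomplete.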
	if (chunkNumber == lastChunkNumber + 1) {
		lastChunkNumber = chunkNumber;
		serializedMutations += kv.value.toString();
	} else {
		lastChunkNumber = -2;
		serializedMutations.clear();
	}
	kvs.push_back(kv);
}

bool AccumulatedMutations::isComplete() const {
	if (lastChunkNumber >= 0) {
		StringRefReader reader(serializedMutations, restore_corrupted_data());

		Version protocolVersion = reader.consume<uint64_t>();
		if (protocolVersion <= 0x0FDB00A200090001) {
			throw incompatible_protocol_version();
		}

		uint32_t vLen = reader.consume<uint32_t>();
		return vLen == reader.remainder().size();
	}

	return false;
}

// Returns true if a complete mutation group contains any MutationRefs which intersect with any
// range in the filters.
// It is undefined behavior to run this if isComplete() does not return true.
bool AccumulatedMutations::matchesAnyRange(const RangeMapFilters& filters) const {
	// Decode param2 so that the individual mutations are available in the mutations vector
	std::vector<MutationRef> mutations = decodeMutationLogValue(serializedMutations);
	for (auto& m : mutations) {
		if (m.type == MutationRef::Encrypted) {
			// TODO:  In order to filter out encrypted mutations that are not relevant to the
			// target range, they would have to be decrypted here in order to check relevance
			// below, however the staged mutations would still need to remain encrypted for
			// staging into the destination database.  Without decrypting, we must assume that
			// some data could match the range and return true here.
			return true;
		}
		if (filters.match(m)) {
			return true;
		}
	}

	return false;
}

bool RangeMapFilters::match(const MutationRef& m) const {
	if (isSingleKeyMutation((MutationRef::Type)m.type)) {
		if (match(singleKeyRange(m.param1))) {
			return true;
		}
	} else if (m.type == MutationRef::ClearRange) {
		if (match(KeyRangeRef(m.param1, m.param2))) {
			return true;
		}
	} else {
		ASSERT(false);
	}
	return false;
}

bool RangeMapFilters::match(const KeyValueRef& kv) const {
	return match(singleKeyRange(kv.key));
}

bool RangeMapFilters::match(const KeyRangeRef& range) const {
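	// A map entry value of 1 marks a range that is covered by the restore range set.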
	auto ranges = rangeMap.intersectingRanges(range);
	for (const auto& r : ranges) {
		if (r.cvalue() == 1) {
			return true;
		}
	}
	return false;
}

// Returns a vector of filtered KV refs from data which are either part of incomplete mutation groups
// OR are complete and have data relevant to one of the KV ranges in the filters
std::vector<KeyValueRef> filterLogMutationKVPairs(VectorRef<KeyValueRef> data, const RangeMapFilters& filters) {
	std::unordered_map<Version, AccumulatedMutations> mutationBlocksByVersion;

	// group mutations by version
	for (auto& kv : data) {
		// each kv is a [param1, param2]
		auto versionAndChunkNumber = decodeMutationLogKey(kv.key);
		mutationBlocksByVersion[versionAndChunkNumber.first].addChunk(versionAndChunkNumber.second, kv);
	}

	std::vector<KeyValueRef> output;

	// Then add each version's chunks to the output; each element is still a KeyValueRef, and chunks
	// of the same version stay together
	for (auto& vb : mutationBlocksByVersion) {
		AccumulatedMutations& m = vb.second;

		// If the mutations are incomplete or match one of the ranges, include in results.
		if (!m.isComplete() || m.matchesAnyRange(filters)) {
			output.insert(output.end(), m.kvs.begin(), m.kvs.end());
		}
	}

	return output;
}

struct RestoreLogDataTaskFunc : RestoreFileTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	static struct : InputParams {
	} Params;

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state RestoreConfig restore(task);

		state RestoreFile logFile = Params.inputFile().get(task);
		state int64_t readOffset = Params.readOffset().get(task);
		state int64_t readLen = Params.readLen().get(task);

		TraceEvent("FileRestoreLogStart")
		    .suppressFor(60)
		    .detail("RestoreUID", restore.getUid())
		    .detail("FileName", logFile.fileName)
		    .detail("FileBeginVersion", logFile.version)
		    .detail("FileEndVersion", logFile.endVersion)
		    .detail("FileSize", logFile.fileSize)
		    .detail("ReadOffset", readOffset)
		    .detail("ReadLen", readLen)
		    .detail("TaskInstance", THIS_ADDR);

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state Reference<IBackupContainer> bc;
		state std::vector<KeyRange> ranges;

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				Reference<IBackupContainer> _bc = wait(restore.sourceContainer().getOrThrow(tr));
				bc = getBackupContainerWithProxy(_bc);

				wait(store(ranges, restore.getRestoreRangesOrDefault(tr)));

				wait(checkTaskVersion(tr->getDatabase(), task, name, version));
				wait(taskBucket->keepRunning(tr, task));

				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		state Key mutationLogPrefix = restore.mutationLogPrefix();
		state Reference<IAsyncFile> inFile = wait(bc->readFile(logFile.fileName));
		state Standalone<VectorRef<KeyValueRef>> dataOriginal =
		    wait(decodeMutationLogFileBlock(inFile, readOffset, readLen));

		// Filter the KV pairs extracted from the log file block to remove any records known to not be needed for
		// this restore based on the restore range set.
		RangeMapFilters filters(ranges);
		state std::vector<KeyValueRef> dataFiltered = filterLogMutationKVPairs(dataOriginal, filters);

		state int start = 0;
		state int end = dataFiltered.size();
		state int dataSizeLimit =
		    BUGGIFY ? deterministicRandom()->randomInt(256 * 1024, 10e6) : CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE;

		tr->reset();
		loop {
			try {
				if (start == end)
					return Void();
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state int i = start;
				state int txBytes = 0;
				for (; i < end && txBytes < dataSizeLimit; ++i) {
					Key k = dataFiltered[i].key.withPrefix(mutationLogPrefix);
					ValueRef v = dataFiltered[i].value; // each KV is a [param1 with added prefix -> param2]
					tr->set(k, v);
					txBytes += k.expectedSize();
					txBytes += v.expectedSize();
				}

				state Future<Void> checkLock = checkDatabaseLock(tr, restore.getUid());

				wait(taskBucket->keepRunning(tr, task));
				wait(checkLock);

				// Add to bytes written count
				restore.bytesWritten().atomicOp(tr, txBytes, MutationRef::Type::AddValue);

				wait(tr->commit());

				TraceEvent("FileRestoreCommittedLog")
				    .suppressFor(60)
				    .detail("RestoreUID", restore.getUid())
				    .detail("FileName", logFile.fileName)
				    .detail("FileBeginVersion", logFile.version)
				    .detail("FileEndVersion", logFile.endVersion)
				    .detail("FileSize", logFile.fileSize)
				    .detail("ReadOffset", readOffset)
				    .detail("ReadLen", readLen)
				    .detail("CommitVersion", tr->getCommittedVersion())
				    .detail("StartIndex", start)
				    .detail("EndIndex", i)
				    .detail("RecordCountOriginal", dataOriginal.size())
				    .detail("RecordCountFiltered", dataFiltered.size())
				    .detail("Bytes", txBytes)
				    .detail("TaskInstance", THIS_ADDR);

				// Commit succeeded, so advance starting point
				start = i;
				tr->reset();
			} catch (Error& e) {
				if (e.code() == error_code_transaction_too_large)
					dataSizeLimit /= 2;
				else
					wait(tr->onError(e));
			}
		}
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		RestoreConfig(task).fileBlocksFinished().atomicOp(tr, 1, MutationRef::Type::AddValue);

		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);

		// TODO:  Check to see if there is a leak in the FutureBucket since an invalid task (validation key fails)
		// will never set its taskFuture.
		wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task));

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 RestoreFile lf,
	                                 int64_t offset,
	                                 int64_t len,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));
		state Reference<Task> task(new Task(RestoreLogDataTaskFunc::name, RestoreLogDataTaskFunc::version, doneKey));

		// Create a restore config from the current task and bind it to the new task.
		// RestoreConfig(parentTask) creates a prefix of: fileRestorePrefixRange.begin/uid->config/[uid]
		wait(RestoreConfig(parentTask).toTask(tr, task));
		Params.inputFile().set(task, lf);
		Params.readOffset().set(task, offset);
		Params.readLen().set(task, len);

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreLogDataTaskFunc::name = "restore_log_data"_sr;
REGISTER_TASKFUNC(RestoreLogDataTaskFunc);

// type|kLen|vLen|Key|Value
// ref: decodeBackupLogValue()
Standalone<StringRef> transformMutationToOldFormat(MutationRef m) {
	BinaryWriter bw(Unversioned());
	uint32_t len1, len2, type;
	type = m.type;
	len1 = m.param1.size();
	len2 = m.param2.size();
	bw << type;
	bw << len1;
	bw << len2;
	// Do not use <<; it is overloaded for StringRef to write its size first
	bw.serializeBytes(m.param1);
	bw.serializeBytes(m.param2);
	return bw.toValue();
}

// This method takes a version and a list of mutation lists for that version;
// each list comes from an iterator and is sorted by subsequence.
// It first orders all mutations by subsequence, then combines them into
// old-format (param1, param2) pairs and returns the result.
// This method assumes that an iterator can return a list of mutations.
/*
    mutations are serialized in the file in the following format:
        <BlockHeader>
        <Version_1><Subseq_1><Mutation1_len><Mutation1>
        <Version_2><Subseq_2><Mutation2_len><Mutation2>
        ...
        <Padding>

   For now, assume each iterator returns a vector<pair<subsequence, mutation>>.
   Note that each mutation's arena has to stay valid during execution.

   Per BackupWorker::addMutation, the version is 64 bits while the subsequence and the mutation
   length are 32 bits each. An iterator combines all mutations of the same version and returns them
   as a vector, together with the subsequence of each mutation, because here we merge-sort by
   subsequence again to decide the final order and decode the StringRefs.
*/
Standalone<VectorRef<KeyValueRef>> generateOldFormatMutations(
    Version commitVersion,
    const std::vector<Standalone<VectorRef<VersionedMutation>>>& newFormatMutations) {
	Standalone<VectorRef<KeyValueRef>> results;
	std::vector<Standalone<VectorRef<KeyValueRef>>> oldFormatMutations;
	// Merge-sort by subsequence here: effectively a global sort over all mutations, which std::map
	// provides by keeping its keys ordered.
	int32_t totalBytes = 0;
	std::map<uint32_t, std::vector<Standalone<StringRef>>> mutationsBySub;
	for (auto& eachTagMutations : newFormatMutations) {
		for (auto& vm : eachTagMutations) {
			uint32_t sub = vm.subsequence;
			Standalone<StringRef> mutationOldFormat = transformMutationToOldFormat(vm.mutation);
			mutationsBySub[sub].push_back(mutationOldFormat);
			totalBytes += mutationOldFormat.size();
		}
	}
	// The concatenated param2 needs its first 64 bits to be the protocol version 0x0FDB00A200090001
	BinaryWriter param2Writer(IncludeVersion(ProtocolVersion::withBackupMutations()));
	param2Writer << totalBytes;

	for (auto& mutationsForSub : mutationsBySub) {
		// concatenate them into param2Writer
		for (auto& m : mutationsForSub.second) {
			// refer to transformMutationToOldFormat
			param2Writer.serializeBytes(m);
		}
	}
	Key param2Concat = param2Writer.toValue();

	// Now construct param1
	int32_t hashBase = commitVersion / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;

	BinaryWriter wrParam1(Unversioned()); // hash/commitVersion/part
	wrParam1 << (uint8_t)hashlittle(&hashBase, sizeof(hashBase), 0);
	wrParam1 << bigEndian64(commitVersion);
	uint32_t* partBuffer = nullptr;

	// TODO: re-use the similar logic in CommitProxyServer::addBackupMutations
	// generate a list of (param1, param2)
	// param2 has format: length_of_the_mutation_group | encoded_mutation_1 | … | encoded_mutation_k
	// each mutation has format type|kLen|vLen|Key|Value
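	// For example, if MUTATION_BLOCK_SIZE were 10000, a 25000-byte param2Concat would be split into
	// three parts (0, 1, 2); decodeMutationLogKey() recovers the part number so the chunks can be
	// reassembled in order.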
	for (int part = 0; part * CLIENT_KNOBS->MUTATION_BLOCK_SIZE < param2Concat.size(); part++) {
		KeyValueRef backupKV;
		// Assign the second parameter as the part
		backupKV.value = getBackupValue(param2Concat, part);
		Key key = getBackupKey(wrParam1, &partBuffer, part); // holds the memory
		backupKV.key = key;
		results.push_back_deep(results.arena(), backupKV);
	}
	return results;
}

struct RestoreLogDataPartitionedTaskFunc : RestoreFileTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	static struct {
		static TaskParam<int64_t> maxTagID() { return __FUNCTION__sr; }
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> endVersion() { return __FUNCTION__sr; }
		static TaskParam<int64_t> bytesWritten() { return __FUNCTION__sr; }
		static TaskParam<std::vector<RestoreConfig::RestoreFile>> logs() { return __FUNCTION__sr; }
	} Params;

	// From all iterators, find the minimum next version and return it together with whether any
	// iterator still has data.
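	// For example, given three iterators whose next versions are 12, 10, and 10, this returns
	// (10, true); once every iterator is exhausted it returns (INT64_MAX, false).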
	ACTOR static Future<std::pair<Version, bool>> findNextVersion(
	    std::vector<Reference<PartitionedLogIterator>> iterators) {
		state Version minVersion = std::numeric_limits<int64_t>::max();
		state bool atLeastOneIteratorHasNext = false;
		state std::vector<Version> minVs(iterators.size(), 0); // can be removed, or kept for debugging

		state int k = 0;
		for (; k < iterators.size(); k++) {
			if (!iterators[k]->hasNext()) {
				continue;
			}
			atLeastOneIteratorHasNext = true;
			Version v = wait(iterators[k]->peekNextVersion());
			minVs[k] = v;
			minVersion = std::min(minVersion, v);
		}
		return std::make_pair(minVersion, atLeastOneIteratorHasNext);
	}

	// Reads from all iterators and sends a list of mutations for the given version
	// to the mutationStream for consumption. This actor reads as fast as possible,
	// which is fine since the task is working on the version range of size
	// RESTORE_PARTITIONED_BATCH_VERSION_SIZE, thus limiting the amount of memory used.
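	// Versions must arrive in strictly increasing order across the merged iterators; the ASSERT in
	// the loop below enforces this.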
	ACTOR static Future<Void> readLogData(PromiseStream<Standalone<VectorRef<KeyValueRef>>> mutationStream,
	                                      std::vector<Reference<PartitionedLogIterator>> iterators,
	                                      Version begin,
	                                      Version end) {
		// Merge-sort across all iterators until every one is exhausted; each pass collects all
		// mutations for the next minimum version, in the new format.
		state bool atLeastOneIteratorHasNext = true;
		state Version minVersion, lastMinVersion = invalidVersion;
		while (atLeastOneIteratorHasNext) {
			std::pair<Version, bool> minVersionAndHasNext = wait(findNextVersion(iterators));
			minVersion = minVersionAndHasNext.first;
			atLeastOneIteratorHasNext = minVersionAndHasNext.second;
			ASSERT_LT(lastMinVersion, minVersion);
			lastMinVersion = minVersion;

			if (atLeastOneIteratorHasNext) {
				std::vector<Standalone<VectorRef<VersionedMutation>>> mutationsSingleVersion =
				    wait(getMutationsForVersion(iterators, minVersion));

				if (minVersion < begin) {
					// Skip generating mutations because this version is not within the desired
					// range; the previous TaskFunc already handled it.
					continue;
				} else if (minVersion >= end) {
					// all valid data has been consumed
					break;
				}

				// transform from new format to old format(param1, param2) for this version.
				// This transformation has to be done version by version.
				Standalone<VectorRef<KeyValueRef>> oldFormatMutations =
				    generateOldFormatMutations(minVersion, mutationsSingleVersion);
				mutationStream.send(oldFormatMutations);
			}
		}
		mutationStream.sendError(end_of_stream());
		return Void();
	}

	// Reads from all iterators and returns a list of mutations for the given version.
	ACTOR static Future<std::vector<Standalone<VectorRef<VersionedMutation>>>> getMutationsForVersion(
	    std::vector<Reference<PartitionedLogIterator>> iterators,
	    Version minVersion) {
		state std::vector<Standalone<VectorRef<VersionedMutation>>> mutationsSingleVersion;

		state int k = 0;
		for (; k < iterators.size(); k++) {
			if (!iterators[k]->hasNext()) {
				continue;
			}
			Version v = wait(iterators[k]->peekNextVersion());
			if (v == minVersion) {
				Standalone<VectorRef<VersionedMutation>> tmp = wait(iterators[k]->getNext());
				mutationsSingleVersion.push_back(tmp);
			}
		}
		return mutationsSingleVersion;
	}

	// Writes backup mutations to the database
	ACTOR static Future<Void> writeMutations(Database cx,
	                                         std::vector<Standalone<VectorRef<KeyValueRef>>> mutations,
	                                         Key mutationLogPrefix,
	                                         Reference<Task> task) {
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state Standalone<VectorRef<KeyValueRef>> oldFormatMutations;
		state int mutationIndex = 0;
		state int mutationCount = 0;
		state int txBytes = 0;
		state int txBytesLimit = CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE;

		for (const auto& data : mutations) {
			oldFormatMutations.append(oldFormatMutations.arena(), data.begin(), data.size());
		}
		state int totalMutation = oldFormatMutations.size();

		// Multiple transactions may be needed, so this has to run in an _execute method rather than
		// a _finish method.
		// These transactions do blind writes, so the operations are idempotent even if multiple
		// instances of the same task run them.
		loop {
			try {
				if (mutationIndex == totalMutation) {
					break;
				}
				txBytes = 0;
				mutationCount = 0;
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				while (mutationIndex + mutationCount < totalMutation && txBytes < txBytesLimit) {
					Key k = oldFormatMutations[mutationIndex + mutationCount].key.withPrefix(mutationLogPrefix);
					ValueRef v = oldFormatMutations[mutationIndex + mutationCount]
					                 .value; // each KV is a [param1 with added prefix -> param2]
					tr->set(k, v);
					txBytes += k.expectedSize();
					txBytes += v.expectedSize();
					++mutationCount;
				}
				wait(tr->commit());
				int64_t oldBytes = Params.bytesWritten().get(task);
				Params.bytesWritten().set(task, oldBytes + txBytes);
				DisabledTraceEvent("FileRestorePartitionedLogCommittData")
				    .detail("TaskInstance", THIS_ADDR)
				    .detail("MutationIndex", mutationIndex)
				    .detail("MutationCount", mutationCount)
				    .detail("TotalMutation", totalMutation)
				    .detail("Bytes", txBytes);
				mutationIndex += mutationCount; // update mutationIndex after the commit succeeds
			} catch (Error& e) {
				if (e.code() == error_code_transaction_too_large) {
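					// Halve the batch size and retry; the loop re-batches from mutationIndex, which
					// only advances after a successful commit.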
					txBytesLimit /= 2;
				} else {
					wait(tr->onError(e));
				}
			}
		}
		return Void();
	}

	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state RestoreConfig restore(task);

		state int64_t maxTagID = Params.maxTagID().get(task);
		state std::vector<RestoreConfig::RestoreFile> logs = Params.logs().get(task);
		state Version begin = Params.beginVersion().get(task);
		state Version end = Params.endVersion().get(task);

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state Reference<IBackupContainer> bc;
		state std::vector<KeyRange> ranges; // these are actual key ranges, not versions
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				Reference<IBackupContainer> _bc = wait(restore.sourceContainer().getOrThrow(tr));
				bc = getBackupContainerWithProxy(_bc);

				wait(store(ranges, restore.getRestoreRangesOrDefault(tr)));

				wait(checkTaskVersion(tr->getDatabase(), task, name, version));
				wait(taskBucket->keepRunning(tr, task));

				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		std::vector<std::vector<RestoreConfig::RestoreFile>> filesByTag(maxTagID + 1);
		std::vector<std::vector<Version>> fileEndVersionByTag(maxTagID + 1);

		for (RestoreConfig::RestoreFile& f : logs) {
			// Find the tag and aggregate files by tag
			if (f.tagId == -1) {
				// inconsistent data
				TraceEvent(SevError, "PartitionedLogFileNoTag")
				    .detail("FileName", f.fileName)
				    .detail("FileSize", f.fileSize)
				    .log();
			} else {
				filesByTag[f.tagId].push_back(f);
			}
		}
		for (int i = 0; i < maxTagID + 1; i++) {
			std::vector<RestoreConfig::RestoreFile>& files = filesByTag[i];
			int cnt = files.size();
			fileEndVersionByTag[i].resize(cnt);
			for (int j = 0; j < cnt; j++) {
				// e.g., for files [10, 20) and [18, 30), the end versions become (18, 30), so the two
				// files effectively cover [10, 18) and [18, 30).
				// Greedy assignment works because the files are sorted by begin version, and it
				// minimizes duplicate reading.
				if (j != cnt - 1 && files[j].endVersion < files[j + 1].version) {
					TraceEvent(SevError, "NonContinuousLog").detail("Tag", i).detail("Index", j).log();
				}
				fileEndVersionByTag[i][j] = (j == cnt - 1 ? end : files[j + 1].version);
			}
		}

		state std::vector<Reference<PartitionedLogIterator>> iterators(maxTagID + 1);
		// for each tag, create an iterator
		for (int k = 0; k < filesByTag.size(); k++) {
			iterators[k] =
			    makeReference<PartitionedLogIteratorTwoBuffers>(bc, k, filesByTag[k], fileEndVersionByTag[k]);
		}

		DisabledTraceEvent("FileRestorePartitionedLogDataExeStart")
		    .detail("BeginVersion", begin)
		    .detail("EndVersion", end)
		    .detail("Files", logs.size())
		    .detail("TaskInstance", THIS_ADDR);

		// Converted backup mutations
		state PromiseStream<Standalone<VectorRef<KeyValueRef>>> mutationStream;
		state Future<Void> reader = readLogData(mutationStream, iterators, begin, end);

		state std::vector<Standalone<VectorRef<KeyValueRef>>> mutations;
		state int64_t totalBytes = 0;
		state Standalone<VectorRef<KeyValueRef>> oneVersionData;
		loop {
			try {
				Standalone<VectorRef<KeyValueRef>> _data = waitNext(mutationStream.getFuture());
				oneVersionData = _data;

				// batching mutations from multiple versions together before writing to the database
				state int64_t bytes = oneVersionData.expectedSize();
				if (totalBytes + bytes > CLIENT_KNOBS->RESTORE_WRITE_TX_SIZE) {
					wait(writeMutations(cx, mutations, restore.mutationLogPrefix(), task));
					mutations.clear();
					totalBytes = 0;
				}
				mutations.push_back(oneVersionData);
				totalBytes += bytes;
			} catch (Error& e) {
				if (e.code() == error_code_end_of_stream) {
					if (mutations.size() > 0) {
						wait(writeMutations(cx, mutations, restore.mutationLogPrefix(), task));
					}
					break;
				} else {
					throw;
				}
			}
		}
		DisabledTraceEvent("FileRestorePartitionedLogDataExeDone")
		    .detail("BeginVersion", begin)
		    .detail("EndVersion", end)
		    .detail("Files", logs.size())
		    .detail("TaskInstance", THIS_ADDR);

		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state int64_t logBytesWritten = Params.bytesWritten().get(task);
		RestoreConfig(task).bytesWritten().atomicOp(tr, logBytesWritten, MutationRef::Type::AddValue);

		int64_t blocks =
		    (logBytesWritten + CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE - 1) / CLIENT_KNOBS->BACKUP_LOGFILE_BLOCK_SIZE;
		// When dispatching, we don't know how many blocks there are, so account for them here
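		// The arithmetic above is ceiling division: e.g., 2.5 blocks' worth of bytes counts as 3
		// blocks.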
		RestoreConfig(task).filesBlocksDispatched().atomicOp(tr, blocks, MutationRef::Type::AddValue);
		RestoreConfig(task).fileBlocksFinished().atomicOp(tr, blocks, MutationRef::Type::AddValue);

		DisabledTraceEvent("FileRestorePartitionedLogCommittedData")
		    .detail("TaskInstance", THIS_ADDR)
		    .detail("Blocks", blocks)
		    .detail("LogBytes", logBytesWritten);

		state Reference<TaskFuture> taskFuture = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);
		wait(taskFuture->set(tr, taskBucket) && taskBucket->finish(tr, task));

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 int64_t maxTagID,
	                                 std::vector<RestoreConfig::RestoreFile> logs,
	                                 Version begin,
	                                 Version end,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));
		state Reference<Task> task(
		    new Task(RestoreLogDataPartitionedTaskFunc::name, RestoreLogDataPartitionedTaskFunc::version, doneKey));

		// Create a restore config from the current task and bind it to the new task.
		// RestoreConfig(parentTask) creates a prefix of: fileRestorePrefixRange.begin/uid->config/[uid]
		wait(RestoreConfig(parentTask).toTask(tr, task));
		Params.maxTagID().set(task, maxTagID);
		Params.beginVersion().set(task, begin);
		Params.endVersion().set(task, end);
		Params.logs().set(task, logs);
		Params.bytesWritten().set(task, 0);

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreLogDataPartitionedTaskFunc::name = "restore_log_data_partitioned"_sr;
REGISTER_TASKFUNC(RestoreLogDataPartitionedTaskFunc);

// Each task could be partitioned into smaller ranges, because the commit proxy only starts
// committing alog/-prefixed mutations back to the original prefix once the final version is set;
// for now this is done in a single task for simplicity.
struct RestoreDispatchPartitionedTaskFunc : RestoreTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	static struct {
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> firstVersion() { return __FUNCTION__sr; }
		static TaskParam<Version> endVersion() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state RestoreConfig restore(task);

		state Version beginVersion = Params.beginVersion().get(task);
		state Version firstVersion = Params.firstVersion().get(task);
		state Version endVersion = Params.endVersion().get(task);
		Reference<IBackupContainer> _bc = wait(restore.sourceContainer().getOrThrow(tr));
		state Reference<IBackupContainer> bc = getBackupContainerWithProxy(_bc);
		state Reference<TaskFuture> onDone = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);

		state Version restoreVersion;
		state int fileLimit = 1000;

		wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) &&
		     checkTaskVersion(tr->getDatabase(), task, name, version));

		// if current is [40, 50] and restore version is 50, we need another [50, 51] task to process data at version 50
		state Version nextEndVersion =
		    std::min(restoreVersion + 1, endVersion + CLIENT_KNOBS->RESTORE_PARTITIONED_BATCH_VERSION_SIZE);
		// update the apply mutations end version so the mutations from the previous batch can be applied.
		// Only do this once beginVersion is past firstVersion (they are equal for the initial dispatch).
		if (beginVersion > firstVersion) {
			// e.g., if the last file is [80, 100] and the restoreVersion is 90, the apply end version
			// is capped by the restore version here;
			// this handles the additional TaskFunc scheduled after the last file
			restore.setApplyEndVersion(tr, std::min(beginVersion, restoreVersion + 1));
		}

		// The applyLag must be retrieved AFTER potentially updating the apply end version.
		state int64_t applyLag = wait(restore.getApplyVersionLag(tr));
		// This guarantees that the commit proxies' applying of alog -> normal keys keeps up with
		// this backupFile -> alog process.
		// If starting a new batch and the apply lag is too large, then re-queue and wait.
		if (applyLag > (BUGGIFY ? 1 : CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 300)) {
			// Wait a small amount of time and then re-add this same task.
			wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
			wait(success(RestoreDispatchPartitionedTaskFunc::addTask(
			    tr, taskBucket, task, firstVersion, beginVersion, endVersion)));

			TraceEvent("FileRestorePartitionDispatch")
			    .detail("RestoreUID", restore.getUid())
			    .detail("BeginVersion", beginVersion)
			    .detail("ApplyLag", applyLag)
			    .detail("Decision", "too_far_behind")
			    .detail("TaskInstance", THIS_ADDR);

			wait(taskBucket->finish(tr, task));
			return Void();
		}

		// Get a batch of files.  We're targeting batchSize blocks (30k) being dispatched, so query
		// for batchSize (150) files (each of which is 0 or more blocks).
		// Boundary files must be included: if the files cover [10, 20) and [20, 30) and our range is
		// [15, 25], we need both files, because [15, 20) is in the first file and [20, 25) is in the
		// second.
		// For a batch [b, e]: the first file (inclusive) is the largest file whose begin <= b, and
		// the last file (exclusive) is the smallest file whose begin > e.
		// Because of the encoding, the version comes before the file type (log or range), so the log
		// query must start from the beginning: given three files (log, 100, 200), (range, 180),
		// (log, 200, 300), a seek for restoring [190, 250] would stop at (range, 180) and never look
		// at (log, 100, 200).
		state Optional<RestoreConfig::RestoreFile> beginLogInclude = Optional<RestoreConfig::RestoreFile>{};
		// Because RestoreFile::pack puts the version at the most significant position, keyAfter(end)
		// does not produce end + 1, so we use greaterThanOrEqual(end + 1) instead of greaterThan(end)
		state Optional<RestoreConfig::RestoreFile> endLogExclude = wait(
		    restore.logFileSet().seekGreaterOrEqual(tr, RestoreConfig::RestoreFile({ endVersion + 1, "", false })));
		state RestoreConfig::FileSetT::RangeResultType logFiles =
		    wait(restore.logFileSet().getRange(tr, beginLogInclude, endLogExclude, fileLimit));
		state Optional<RestoreConfig::RestoreFile> beginRangeInclude =
		    wait(restore.rangeFileSet().seekGreaterOrEqual(tr, RestoreConfig::RestoreFile({ beginVersion, "", true })));
		// Same as above: greaterThanOrEqual(end + 1) instead of greaterThan(end), because
		// RestoreFile::pack has the version at the most significant position
		state Optional<RestoreConfig::RestoreFile> endRangeExclude = wait(
		    restore.rangeFileSet().seekGreaterOrEqual(tr, RestoreConfig::RestoreFile({ endVersion + 1, "", true })));
		state RestoreConfig::FileSetT::RangeResultType rangeFiles =
		    wait(restore.rangeFileSet().getRange(tr, beginRangeInclude, endRangeExclude, fileLimit));
		state int64_t maxTagID = 0;
		state std::vector<RestoreConfig::RestoreFile> logs;
		state std::vector<RestoreConfig::RestoreFile> ranges;
		for (auto f : logFiles.results) {
			if (f.endVersion > beginVersion) {
				// Skip all files whose endVersion is smaller than or equal to beginVersion.
				logs.push_back(f);
				maxTagID = std::max(maxTagID, f.tagId);
			}
		}
		for (auto f : rangeFiles.results) {
			// getRange may return out-of-range range files (the query bounds are widened because
			// the log files need them), so filter by version here.
			if (f.version >= beginVersion && f.version < endVersion) {
				ranges.push_back(f);
			}
		}
		// allPartsDone will be set once all block tasks in the current batch are finished.
		// Create a new future for the new batch.
		state Reference<TaskFuture> allPartsDone = futureBucket->future(tr);
		restore.batchFuture().set(tr, allPartsDone->pack());

		// If there are no files and this is not the last batch, move on to the next batch.
		// If there are no files and this is the last batch, just wait for applying to finish.
		// TODO: is a files.results.size() == 0 check needed here at all?
		if (beginVersion > restoreVersion) {
			if (applyLag == 0) {
				// This is the last batch and the apply lag is 0, so we are done; create the
				// completion task.
				wait(success(RestoreCompleteTaskFunc::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal())));

				TraceEvent("FileRestorePartitionDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "restore_complete")
				    .detail("TaskInstance", THIS_ADDR);
			} else {
				// This is the last batch but applyLag is not zero: applying of mutations is not yet
				// finished. Wait a small amount of time and then re-add this same task as a dummy;
				// once the lag reaches zero it will take the branch above.
				wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
				wait(success(RestoreDispatchPartitionedTaskFunc::addTask(
				    tr, taskBucket, task, firstVersion, beginVersion, endVersion)));

				TraceEvent("FileRestorePartitionDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "apply_still_behind")
				    .detail("TaskInstance", THIS_ADDR);
			}
			wait(taskBucket->finish(tr, task));
			return Void();
		}

		// If we reach here, this batch is not empty (i.e. it contains range and/or mutation log files).
		// Start moving through the file lists and queuing up block tasks for this batch.
		state std::vector<Future<Key>> addTaskFutures;
		state int i = 0;
		// Process all range files first, reusing the same RestoreRangeTaskFunc as non-partitioned
		// restore. This can be done first because range files do not overlap within a restore uid.
		// Each task reads its file, clears the target range, restores the keys to their original
		// locations, and updates keyVersionMap[key -> versionFromRangeFile].
		// At this point the corresponding mutation log files within the same version range have not
		// been applied yet, because they are waiting for the signal from this
		// RestoreDispatchPartitionedTaskFunc. When the logs are applied, each mutation's version is
		// compared to the keyVersionMap entries written by the range files; after each
		// RestoreDispatchPartitionedTaskFunc, keyVersionMap entries are cleared if the mutation
		// version is larger.
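		// Illustrative example (assumed versions): a range file at version 80 restores key k and
		// sets keyVersionMap[k] = 80; a logged mutation on k at version 95 is applied (95 > 80),
		// while one at version 70 is discarded as stale.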
		for (; i < ranges.size(); ++i) {
			RestoreConfig::RestoreFile& f = ranges[i];
			// For each block of the file
			for (int64_t j = 0; j < f.fileSize; j += f.blockSize) {
				addTaskFutures.push_back(RestoreRangeTaskFunc::addTask(tr,
				                                                       taskBucket,
				                                                       task,
				                                                       f,
				                                                       j,
				                                                       std::min<int64_t>(f.blockSize, f.fileSize - j),
				                                                       TaskCompletionKey::joinWith(allPartsDone)));
			}
		}
		addTaskFutures.push_back(RestoreLogDataPartitionedTaskFunc::addTask(
		    tr, taskBucket, task, maxTagID, logs, beginVersion, endVersion, TaskCompletionKey::joinWith(allPartsDone)));
		// Even if files exist but are empty, just start the next batch.

		addTaskFutures.push_back(RestoreDispatchPartitionedTaskFunc::addTask(tr,
		                                                                     taskBucket,
		                                                                     task,
		                                                                     firstVersion,
		                                                                     endVersion,
		                                                                     nextEndVersion,
		                                                                     TaskCompletionKey::noSignal(),
		                                                                     allPartsDone));
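		// The next dispatch above is added with allPartsDone as its waitFor, so it starts only
		// after every block task in this batch has signaled completion.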

		wait(waitForAll(addTaskFutures));
		wait(taskBucket->finish(tr, task));

		TraceEvent("FileRestorePartitionDispatch")
		    .detail("RestoreUID", restore.getUid())
		    .detail("BeginVersion", beginVersion)
		    .detail("EndVersion", endVersion)
		    .detail("NextEndVersion", nextEndVersion)
		    .detail("ApplyLag", applyLag)
		    .detail("Decision", "dispatch_batch_complete")
		    .detail("TaskInstance", THIS_ADDR);

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 Version firstVersion,
	                                 Version beginVersion,
	                                 Version endVersion,
	                                 TaskCompletionKey completionKey = TaskCompletionKey::noSignal(),
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));

		// Partitioned dispatch tasks always run at normal priority, unlike the non-partitioned
		// dispatcher which boosts tasks that must queue more blocks for the current batch.
		unsigned int priority = 0;
		state Reference<Task> task(new Task(
		    RestoreDispatchPartitionedTaskFunc::name, RestoreDispatchPartitionedTaskFunc::version, doneKey, priority));

		// Create a config from the parent task and bind it to the new task
		wait(RestoreConfig(parentTask).toTask(tr, task));
		Params.firstVersion().set(task, firstVersion);
		Params.beginVersion().set(task, beginVersion);
		Params.endVersion().set(task, endVersion);

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreDispatchPartitionedTaskFunc::name = "restore_dispatch_partitioned"_sr;
REGISTER_TASKFUNC(RestoreDispatchPartitionedTaskFunc);

struct RestoreDispatchTaskFunc : RestoreTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;
	StringRef getName() const override { return name; };

	static struct {
		static TaskParam<Version> beginVersion() { return __FUNCTION__sr; }
		static TaskParam<std::string> beginFile() { return __FUNCTION__sr; }
		static TaskParam<int64_t> beginBlock() { return __FUNCTION__sr; }
		static TaskParam<int64_t> batchSize() { return __FUNCTION__sr; }
		static TaskParam<int64_t> remainingInBatch() { return __FUNCTION__sr; }
	} Params;

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state RestoreConfig restore(task);

		state Version beginVersion = Params.beginVersion().get(task);
		state Reference<TaskFuture> onDone = futureBucket->unpack(task->params[Task::reservedTaskParamKeyDone]);

		state int64_t remainingInBatch = Params.remainingInBatch().get(task);
		state bool addingToExistingBatch = remainingInBatch > 0;
		state Version restoreVersion;

		wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)) &&
		     checkTaskVersion(tr->getDatabase(), task, name, version));

		// If not adding to an existing batch then update the apply mutations end version so the mutations from the
		// previous batch can be applied.  Only do this once beginVersion is > 0 (it will be 0 for the initial
		// dispatch).
		if (!addingToExistingBatch && beginVersion > 0) {
			// Unblock applying from the alog to the normal keyspace.
			// Cap at restoreVersion + 1: if the last file spans [80, 100] but the restoreVersion is
			// 90, applying must stop at 90. This is called once more after the last file.
			restore.setApplyEndVersion(tr, std::min(beginVersion, restoreVersion + 1));
		}

		// The applyLag must be retrieved AFTER potentially updating the apply end version.
		state int64_t applyLag = wait(restore.getApplyVersionLag(tr));
		state int64_t batchSize = Params.batchSize().get(task);

		// If starting a new batch and the apply lag is too large then re-queue and wait
		if (!addingToExistingBatch && applyLag > (BUGGIFY ? 1 : CLIENT_KNOBS->CORE_VERSIONSPERSECOND * 300)) {
			// Wait a small amount of time and then re-add this same task.
			wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
			wait(success(RestoreDispatchTaskFunc::addTask(
			    tr, taskBucket, task, beginVersion, "", 0, batchSize, remainingInBatch)));

			TraceEvent("FileRestoreDispatch")
			    .detail("RestoreUID", restore.getUid())
			    .detail("BeginVersion", beginVersion)
			    .detail("ApplyLag", applyLag)
			    .detail("BatchSize", batchSize)
			    .detail("Decision", "too_far_behind")
			    .detail("TaskInstance", THIS_ADDR);

			wait(taskBucket->finish(tr, task));
			return Void();
		}

		// beginFile is needed to handle stopping in the middle of a version.
		state std::string beginFile = Params.beginFile().getOrDefault(task);
		// Get a batch of files.  We're targeting batchSize blocks being dispatched so query for batchSize files
		// (each of which is 0 or more blocks).
		state int taskBatchSize = BUGGIFY ? 1 : CLIENT_KNOBS->RESTORE_DISPATCH_ADDTASK_SIZE;
		state RestoreConfig::FileSetT::RangeResultType files = wait(restore.fileSet().getRange(
		    tr, Optional<RestoreConfig::RestoreFile>({ beginVersion, beginFile }), {}, taskBatchSize));

		// allPartsDone will be set once all block tasks in the current batch are finished.
		state Reference<TaskFuture> allPartsDone;

		// If adding to existing batch then join the new block tasks to the existing batch future
		if (addingToExistingBatch) {
			Key fKey = wait(restore.batchFuture().getD(tr));
			allPartsDone = Reference<TaskFuture>(new TaskFuture(futureBucket, fKey));
		} else {
			// Otherwise create a new future for the new batch
			allPartsDone = futureBucket->future(tr);
			restore.batchFuture().set(tr, allPartsDone->pack());
			// Set batch quota remaining to batch size
			remainingInBatch = batchSize;
		}

		// If there were no files to load then this batch is done and restore is almost done.
		if (files.results.size() == 0) {
			// If adding to existing batch then blocks could be in progress so create a new Dispatch task that waits
			// for them to finish
			if (addingToExistingBatch) {
				// Setting next begin to restoreVersion + 1 so that any files in the file map at the restore version
				// won't be dispatched again.
				wait(success(RestoreDispatchTaskFunc::addTask(tr,
				                                              taskBucket,
				                                              task,
				                                              restoreVersion + 1,
				                                              "",
				                                              0,
				                                              batchSize,
				                                              0,
				                                              TaskCompletionKey::noSignal(),
				                                              allPartsDone)));

				TraceEvent("FileRestoreDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("BeginFile", Params.beginFile().get(task))
				    .detail("BeginBlock", Params.beginBlock().get(task))
				    .detail("RestoreVersion", restoreVersion)
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "end_of_final_batch")
				    .detail("TaskInstance", THIS_ADDR);
			} else if (beginVersion < restoreVersion) {
				// If beginVersion is less than restoreVersion then do one more dispatch task to get
				// there; there are no more files between beginVersion and restoreVersion.
				wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, restoreVersion, "", 0, batchSize)));

				TraceEvent("FileRestoreDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("BeginFile", Params.beginFile().get(task))
				    .detail("BeginBlock", Params.beginBlock().get(task))
				    .detail("RestoreVersion", restoreVersion)
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "apply_to_restore_version")
				    .detail("TaskInstance", THIS_ADDR);
			} else if (applyLag == 0) {
				// If apply lag is 0 then we are done so create the completion task
				wait(success(RestoreCompleteTaskFunc::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal())));

				TraceEvent("FileRestoreDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("BeginFile", Params.beginFile().get(task))
				    .detail("BeginBlock", Params.beginBlock().get(task))
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "restore_complete")
				    .detail("TaskInstance", THIS_ADDR);
			} else {
				// Applying of mutations is not yet finished, so wait a small amount of time and
				// then re-add this same task as a dummy, purely to wait for applying to complete.
				wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
				wait(success(RestoreDispatchTaskFunc::addTask(tr, taskBucket, task, beginVersion, "", 0, batchSize)));

				TraceEvent("FileRestoreDispatch")
				    .detail("RestoreUID", restore.getUid())
				    .detail("BeginVersion", beginVersion)
				    .detail("ApplyLag", applyLag)
				    .detail("Decision", "apply_still_behind")
				    .detail("TaskInstance", THIS_ADDR);
			}

			// If adding to existing batch then task is joined with a batch future so set done future
			// Note that this must be done after joining at least one task with the batch future in case all other
			// blockers already finished.
			Future<Void> setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void();

			wait(taskBucket->finish(tr, task) && setDone);
			return Void();
		}

		// Start moving through the file list and queuing up blocks.  Only queue up to RESTORE_DISPATCH_ADDTASK_SIZE
		// blocks per Dispatch task and target batchSize total per batch but a batch must end on a complete version
		// boundary so exceed the limit if necessary to reach the end of a version of files.
		state std::vector<Future<Key>> addTaskFutures;
		state Version endVersion = files.results[0].version;
		state int blocksDispatched = 0;
		state int64_t beginBlock = Params.beginBlock().getOrDefault(task);
		state int i = 0;

		// For each file. No new task is created at this level because restore files are read back
		// together -- both range and log -- so the range files have to be processed here anyway.
		for (; i < files.results.size(); ++i) {
			RestoreConfig::RestoreFile& f = files.results[i];

			// Here we are "between versions" (prior to adding the first block of the first file of a new version)
			// so this is an opportunity to end the current dispatch batch (which must end on a version boundary) if
			// the batch size has been reached or exceeded
			if (f.version != endVersion && remainingInBatch <= 0) {
				// Next start will be at the first version after endVersion at the first file first block
				++endVersion;
				// beginFile is set to empty to indicate we are not in the middle of a version's
				// files: if a range file at v=80 and a log file spanning v=[80, 100] both exist,
				// the log file must be included in the same batch as the range file.
				beginFile = "";
				beginBlock = 0;
				break;
			}

			// Set the starting point for the next task in case we stop inside this file
			endVersion = f.version;
			beginFile = f.fileName;

			state int64_t j = beginBlock * f.blockSize;
			// For each block of the file
			for (; j < f.fileSize; j += f.blockSize) {
				// Stop if we've reached the addtask limit
				if (blocksDispatched == taskBatchSize)
					break;

				if (f.isRange) {
					addTaskFutures.push_back(
					    RestoreRangeTaskFunc::addTask(tr,
					                                  taskBucket,
					                                  task,
					                                  f,
					                                  j,
					                                  std::min<int64_t>(f.blockSize, f.fileSize - j),
					                                  TaskCompletionKey::joinWith(allPartsDone)));
				} else {
					addTaskFutures.push_back(
					    RestoreLogDataTaskFunc::addTask(tr,
					                                    taskBucket,
					                                    task,
					                                    f,
					                                    j,
					                                    std::min<int64_t>(f.blockSize, f.fileSize - j),
					                                    TaskCompletionKey::joinWith(allPartsDone)));
				}

				// Increment beginBlock for the file and total blocks dispatched for this task
				++beginBlock;
				++blocksDispatched;
				--remainingInBatch;
			}

			// Stop if we've reached the addtask limit
			if (blocksDispatched == taskBatchSize)
				break;

			// We just completed an entire file, so the next task should start at the file after
			// this one within endVersion (or later) if this iteration ends up being the last for
			// this task. Appending '\x00' yields the least string strictly greater than beginFile.
			beginFile = beginFile + '\x00';
			beginBlock = 0;

			TraceEvent("FileRestoreDispatchedFile")
			    .suppressFor(60)
			    .detail("RestoreUID", restore.getUid())
			    .detail("FileName", f.fileName)
			    .detail("TaskInstance", THIS_ADDR);
		}

		// If no blocks were dispatched then the next dispatch task should run now and be joined with the
		// allPartsDone future
		if (blocksDispatched == 0) {
			std::string decision;

			// If no files were dispatched either then the batch size wasn't large enough to catch all of the files
			// at the next lowest non-dispatched version, so increase the batch size.
			if (i == 0) {
				batchSize *= 2;
				decision = "increased_batch_size";
			} else
				decision = "all_files_were_empty";

			TraceEvent("FileRestoreDispatch")
			    .detail("RestoreUID", restore.getUid())
			    .detail("BeginVersion", beginVersion)
			    .detail("BeginFile", Params.beginFile().get(task))
			    .detail("BeginBlock", Params.beginBlock().get(task))
			    .detail("EndVersion", endVersion)
			    .detail("ApplyLag", applyLag)
			    .detail("BatchSize", batchSize)
			    .detail("Decision", decision)
			    .detail("TaskInstance", THIS_ADDR)
			    .detail("RemainingInBatch", remainingInBatch);

			wait(success(RestoreDispatchTaskFunc::addTask(tr,
			                                              taskBucket,
			                                              task,
			                                              endVersion,
			                                              beginFile,
			                                              beginBlock,
			                                              batchSize,
			                                              remainingInBatch,
			                                              TaskCompletionKey::joinWith(allPartsDone))));

			// If adding to existing batch then task is joined with a batch future so set done future.
			// Note that this must be done after joining at least one task with the batch future in case all other
			// blockers already finished.
			Future<Void> setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void();

			wait(setDone && taskBucket->finish(tr, task));

			return Void();
		}

		// Increment the number of blocks dispatched in the restore config
		restore.filesBlocksDispatched().atomicOp(tr, blocksDispatched, MutationRef::Type::AddValue);

		// If beginFile is not empty then we had to stop in the middle of a version (possibly within a file) so we
		// cannot end the batch here because we do not know if we got all of the files and blocks from the last
		// version queued, so make sure remainingInBatch is at least 1.
		if (!beginFile.empty()) {
			// This ensures that if we stop in the middle of a version we do not end this batch;
			// instead, the next RestoreDispatchTaskFunc will have addingToExistingBatch set to true,
			// so both are treated as the same batch and the alog is committed only when all of them
			// succeed.
			remainingInBatch = std::max<int64_t>(1, remainingInBatch);
		}
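		// Illustrative example (assumed numbers): files f1 (4 blocks) and f2 (2 blocks) both at
		// version 100, with taskBatchSize reached inside f1: beginFile stays "f1" (non-empty),
		// remainingInBatch is forced to at least 1, and the follow-on dispatch task resumes at
		// f1's next block as part of the same batch.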

		// If more blocks need to be dispatched in this batch then add a follow-on task that is part
		// of the allPartsDone group, which won't wait to run and will add more block tasks.
		if (remainingInBatch > 0)
			addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr,
			                                                          taskBucket,
			                                                          task,
			                                                          endVersion,
			                                                          beginFile,
			                                                          beginBlock,
			                                                          batchSize,
			                                                          remainingInBatch,
			                                                          TaskCompletionKey::joinWith(allPartsDone)));
		else // Otherwise, add a follow-on task to continue after all previously dispatched blocks are done
			addTaskFutures.push_back(RestoreDispatchTaskFunc::addTask(tr,
			                                                          taskBucket,
			                                                          task,
			                                                          endVersion,
			                                                          beginFile,
			                                                          beginBlock,
			                                                          batchSize,
			                                                          0,
			                                                          TaskCompletionKey::noSignal(),
			                                                          allPartsDone));
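		// Note the difference between the two follow-on tasks above: joinWith(allPartsDone) makes
		// the new task a member of the batch so it can run immediately, while passing allPartsDone
		// as waitFor defers adding the task until the batch future has been set.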

		wait(waitForAll(addTaskFutures));

		// If adding to existing batch then task is joined with a batch future so set done future.
		Future<Void> setDone = addingToExistingBatch ? onDone->set(tr, taskBucket) : Void();

		wait(setDone && taskBucket->finish(tr, task));

		TraceEvent("FileRestoreDispatch")
		    .detail("RestoreUID", restore.getUid())
		    .detail("BeginVersion", beginVersion)
		    .detail("BeginFile", Params.beginFile().get(task))
		    .detail("BeginBlock", Params.beginBlock().get(task))
		    .detail("EndVersion", endVersion)
		    .detail("ApplyLag", applyLag)
		    .detail("BatchSize", batchSize)
		    .detail("Decision", "dispatched_files")
		    .detail("FilesDispatched", i)
		    .detail("BlocksDispatched", blocksDispatched)
		    .detail("TaskInstance", THIS_ADDR)
		    .detail("RemainingInBatch", remainingInBatch);

		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 Reference<Task> parentTask,
	                                 Version beginVersion,
	                                 std::string beginFile,
	                                 int64_t beginBlock,
	                                 int64_t batchSize,
	                                 int64_t remainingInBatch = 0,
	                                 TaskCompletionKey completionKey = TaskCompletionKey::noSignal(),
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		Key doneKey = wait(completionKey.get(tr, taskBucket));

		// Use high priority for dispatch tasks that have to queue more blocks for the current batch
		unsigned int priority = (remainingInBatch > 0) ? 1 : 0;
		state Reference<Task> task(
		    new Task(RestoreDispatchTaskFunc::name, RestoreDispatchTaskFunc::version, doneKey, priority));

		// Create a config from the parent task and bind it to the new task
		wait(RestoreConfig(parentTask).toTask(tr, task));
		Params.beginVersion().set(task, beginVersion);
		Params.batchSize().set(task, batchSize);
		Params.remainingInBatch().set(task, remainingInBatch);
		Params.beginBlock().set(task, beginBlock);
		Params.beginFile().set(task, beginFile);

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return Void();
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef RestoreDispatchTaskFunc::name = "restore_dispatch"_sr;
REGISTER_TASKFUNC(RestoreDispatchTaskFunc);

ACTOR Future<std::string> restoreStatus(Reference<ReadYourWritesTransaction> tr, Key tagName) {
	tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);

	state std::vector<KeyBackedTag> tags;
	if (tagName.size() == 0) {
		std::vector<KeyBackedTag> t = wait(getAllRestoreTags(tr));
		tags = t;
	} else
		tags.push_back(makeRestoreTag(tagName.toString()));

	state std::string result;
	state int i = 0;

	for (; i < tags.size(); ++i) {
		UidAndAbortedFlagT u = wait(tags[i].getD(tr));
		std::string s = wait(RestoreConfig(u.first).getFullStatus(tr));
		result.append(s);
		result.append("\n\n");
	}

	return result;
}

ACTOR Future<ERestoreState> abortRestore(Reference<ReadYourWritesTransaction> tr, Key tagName) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);

	state KeyBackedTag tag = makeRestoreTag(tagName.toString());
	state Optional<UidAndAbortedFlagT> current = wait(tag.get(tr));
	if (!current.present())
		return ERestoreState::UNINITIALIZED;

	state RestoreConfig restore(current.get().first);

	state ERestoreState status = wait(restore.stateEnum().getD(tr));
	state bool runnable = wait(restore.isRunnable(tr));

	if (!runnable)
		return status;

	restore.stateEnum().set(tr, ERestoreState::ABORTED);

	// Clear all of the apply mutations state
	restore.clearApplyMutationsKeys(tr);

	// Cancel the restore tasks on this tag
	wait(tag.cancel(tr));

	bool bgRestoreEnabled = wait(restore.isBlobGranuleRestore().getD(tr));
	if (bgRestoreEnabled) {
		BlobGranuleRestoreConfig bgRestoreConfig;
		bgRestoreConfig.phase().set(tr, BlobRestorePhase::ERROR);
		bgRestoreConfig.phaseStartTs().set(tr, BlobRestorePhase::ERROR, now());
		bgRestoreConfig.error().set(tr, "Aborted");
	}

	wait(unlockDatabase(tr, current.get().first));
	return ERestoreState::ABORTED;
}

ACTOR Future<ERestoreState> abortRestore(Database cx, Key tagName) {
	state Reference<ReadYourWritesTransaction> tr =
	    Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));

	loop {
		try {
			ERestoreState estate = wait(abortRestore(tr, tagName));
			if (estate != ERestoreState::ABORTED) {
				return estate;
			}
			wait(tr->commit());
			break;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}

	tr = Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));

	// Commit a dummy transaction before returning success, to ensure the mutation applier has stopped submitting
	// mutations
	loop {
		try {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);
			tr->addReadConflictRange(singleKeyRange(KeyRef()));
			tr->addWriteConflictRange(singleKeyRange(KeyRef()));
			wait(tr->commit());
			return ERestoreState::ABORTED;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
}

struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
	static StringRef name;
	static constexpr uint32_t version = 1;

	static struct {
		static TaskParam<Version> firstVersion() { return __FUNCTION__sr; }
	} Params;

	// Find all files needed for the restore and save them in the RestoreConfig for the task.
	// Update the total number of files and blocks and change state to starting.
	ACTOR static Future<Void> _execute(Database cx,
	                                   Reference<TaskBucket> taskBucket,
	                                   Reference<FutureBucket> futureBucket,
	                                   Reference<Task> task) {
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state RestoreConfig restore(task);
		state Version restoreVersion;
		state Version beginVersion;
		state Reference<IBackupContainer> bc;
		state std::vector<KeyRange> ranges;
		state bool logsOnly;
		state bool inconsistentSnapshotOnly;
		state bool isBlobGranuleRestore;

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(checkTaskVersion(tr->getDatabase(), task, name, version));
				wait(store(beginVersion, restore.beginVersion().getD(tr, Snapshot::False, ::invalidVersion)));

				wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)));
				wait(store(ranges, restore.getRestoreRangesOrDefault(tr)));
				wait(store(logsOnly, restore.onlyApplyMutationLogs().getD(tr, Snapshot::False, false)));
				wait(store(inconsistentSnapshotOnly,
				           restore.inconsistentSnapshotOnly().getD(tr, Snapshot::False, false)));
				wait(store(isBlobGranuleRestore, restore.isBlobGranuleRestore().getD(tr, Snapshot::False, false)));
				wait(taskBucket->keepRunning(tr, task));

				ERestoreState oldState = wait(restore.stateEnum().getD(tr));
				if (oldState != ERestoreState::QUEUED && oldState != ERestoreState::STARTING) {
					wait(restore.logError(cx,
					                      restore_error(),
					                      format("StartFullRestore: Encountered unexpected state(%d)", oldState),
					                      THIS));
					return Void();
				}
				restore.stateEnum().set(tr, ERestoreState::STARTING);
				restore.fileSet().clear(tr);
				restore.fileBlockCount().clear(tr);
				restore.fileCount().clear(tr);
				Reference<IBackupContainer> _bc = wait(restore.sourceContainer().getOrThrow(tr));
				bc = getBackupContainerWithProxy(_bc);

				wait(tr->commit());
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		loop {
			if (!isBlobGranuleRestore) {
				break; // Skip this loop if it's not blob granule restore
			}
			CODE_PROBE(true, "Blob granule restore - wait data copy for completion");
			try {
				tr->reset();
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state BlobRestorePhase phase = wait(BlobGranuleRestoreConfig().phase().getD(tr));
				TraceEvent("BlobGranuleRestoreWaitPhase").detail("BlobRestorePhase", phase);
				if (phase == BlobRestorePhase::ERROR) {
					TraceEvent("BlobGranuleRestoreError").log();
					restore.stateEnum().set(tr, ERestoreState::ABORTED);
					wait(tr->commit());
					return Void();
				}
				if (phase != BlobRestorePhase::APPLYING_MLOGS) {
					wait(delay(CLIENT_KNOBS->BLOB_GRANULE_RESTORE_CHECK_INTERVAL));
				} else {
					Version version = wait(BlobGranuleRestoreConfig().beginVersion().getOrThrow(tr));
					beginVersion = version;
					restore.beginVersion().set(tr, beginVersion);
					TraceEvent("BlobGranuleRestoreResume").detail("BeginVersion", beginVersion);
					wait(tr->commit());
					if (restoreVersion <= beginVersion) {
						TraceEvent("BlobGranuleRestoreDone")
						    .detail("BeginVersion", beginVersion)
						    .detail("Target", restoreVersion);
						return Void();
					}
					break;
				}
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		tr->reset();
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				Version destVersion = wait(tr->getReadVersion());
				TraceEvent("FileRestoreVersionUpgrade")
				    .detail("RestoreVersion", restoreVersion)
				    .detail("Dest", destVersion);
				if (destVersion <= restoreVersion) {
					CODE_PROBE(true, "Forcing restored cluster to higher version");
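					// Illustrative example (assumed versions): if destVersion is 5e9 but
					// restoreVersion is 7e9, committing minRequiredCommitVersionKey = 7e9 + 1
					// forces the cluster's version past the restore version before proceeding.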
					tr->set(minRequiredCommitVersionKey, BinaryWriter::toValue(restoreVersion + 1, Unversioned()));
					wait(tr->commit());
				} else {
					break;
				}
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		state Version firstConsistentVersion = invalidVersion;
		if (beginVersion == invalidVersion) {
			beginVersion = 0;
		}
		state Standalone<VectorRef<KeyRangeRef>> keyRangesFilter;
		for (auto const& r : ranges) {
			keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r));
		}
		state Optional<RestorableFileSet> restorable =
		    wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion));
		if (!restorable.present())
			throw restore_missing_data();

		// Convert the two lists in restorable (logs and ranges) to a single list of RestoreFiles.
		// Order does not matter, they will be put in order when written to the restoreFileMap below.
		state std::vector<RestoreConfig::RestoreFile> files;
		state std::vector<RestoreConfig::RestoreFile> logFiles;
		state std::vector<RestoreConfig::RestoreFile> rangeFiles;
		if (!logsOnly) {
			beginVersion = restorable.get().snapshot.beginVersion;

			if (!inconsistentSnapshotOnly) {
				for (const RangeFile& f : restorable.get().ranges) {
					files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
					rangeFiles.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
					// In a restore with both snapshots and logs, the firstConsistentVersion is the highest version
					// of any range file.
					firstConsistentVersion = std::max(firstConsistentVersion, f.version);
				}
			} else {
				for (int i = 0; i < restorable.get().ranges.size(); ++i) {
					const RangeFile& f = restorable.get().ranges[i];
					files.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
					rangeFiles.push_back({ f.version, f.fileName, true, f.blockSize, f.fileSize });
					// In inconsistentSnapshotOnly mode, if all range files have the same version, then it is the
					// firstConsistentVersion, otherwise unknown (use -1).
					if (i != 0 && f.version != firstConsistentVersion) {
						firstConsistentVersion = invalidVersion;
					} else {
						firstConsistentVersion = f.version;
					}
				}
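				// Illustrative example (assumed versions): range files at versions {50, 50, 50}
				// yield firstConsistentVersion = 50, while {50, 60} yields invalidVersion (-1).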
			}
		} else {
			// In logs-only (incremental) mode, the firstConsistentVersion should just be restore.beginVersion().
			firstConsistentVersion = beginVersion;
		}
		if (!inconsistentSnapshotOnly) {
			for (const LogFile& f : restorable.get().logs) {
				files.push_back(
				    { f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.tagId, f.totalTags });
				logFiles.push_back(
				    { f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.tagId, f.totalTags });
			}
		}
		// First version for which log data should be applied
		Params.firstVersion().set(task, beginVersion);

		tr->reset();
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				restore.firstConsistentVersion().set(tr, firstConsistentVersion);
				wait(tr->commit());
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// add log files
		state std::vector<RestoreConfig::RestoreFile>::iterator logStart = logFiles.begin();
		state std::vector<RestoreConfig::RestoreFile>::iterator logEnd = logFiles.end();
		state int txBytes = 0;

		tr->reset();
		while (logStart != logEnd) {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(taskBucket->keepRunning(tr, task));

				state std::vector<RestoreConfig::RestoreFile>::iterator logIt = logStart;

				txBytes = 0;
				state int logFileCount = 0;
				auto fileSet = restore.logFileSet();
				// TODO: split files across multiple keys, because there can be on the order of 10k
				// or 100k files, which probably cannot all fit given FDB's value size limit. As a
				// result, fileSet has everything, including [beginVersion, endVersion] for each tag.
				for (; logIt != logEnd && txBytes < 1e6; ++logIt) {
					txBytes += fileSet.insert(tr, *logIt);
					++logFileCount;
				}
				wait(tr->commit());

				TraceEvent("FileRestoreLoadedLogFiles")
				    .detail("RestoreUID", restore.getUid())
				    .detail("FileCount", logFileCount)
				    .detail("TransactionBytes", txBytes)
				    .detail("TaskInstance", THIS_ADDR);

				logStart = logIt;
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}
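		// Illustrative sizing (assumed serialized entry size, not measured): at roughly 100 bytes
		// per RestoreFile entry, each transaction above inserts on the order of 10k log file
		// entries before reaching the ~1e6-byte cap.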

		state std::vector<RestoreConfig::RestoreFile>::iterator rangeStart = rangeFiles.begin();
		state std::vector<RestoreConfig::RestoreFile>::iterator rangeEnd = rangeFiles.end();

		tr->reset();
		while (rangeStart != rangeEnd) {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(taskBucket->keepRunning(tr, task));

				state std::vector<RestoreConfig::RestoreFile>::iterator rangeIt = rangeStart;

				txBytes = 0;
				state int rangeFileCount = 0;
				auto fileSet = restore.rangeFileSet();
				// Insert range files in transaction-sized chunks, as above for the log files.
				for (; rangeIt != rangeEnd && txBytes < 1e6; ++rangeIt) {
					txBytes += fileSet.insert(tr, *rangeIt);
					++rangeFileCount;
				}
				wait(tr->commit());

				TraceEvent("FileRestoreLoadedRangeFiles")
				    .detail("RestoreUID", restore.getUid())
				    .detail("FileCount", rangeFileCount)
				    .detail("TransactionBytes", txBytes)
				    .detail("TaskInstance", THIS_ADDR);

				rangeStart = rangeIt;
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		// add files
		state std::vector<RestoreConfig::RestoreFile>::iterator start = files.begin();
		state std::vector<RestoreConfig::RestoreFile>::iterator end = files.end();

		tr->reset();
		while (start != end) {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				wait(taskBucket->keepRunning(tr, task));

				state std::vector<RestoreConfig::RestoreFile>::iterator it = start;

				txBytes = 0;
				state int nFileBlocks = 0;
				state int nFiles = 0;
				auto fileSet = restore.fileSet();
				// Insert all files (ranges and logs) in transaction-sized chunks, counting files
				// and file blocks as we go.
				for (; it != end && txBytes < 1e6; ++it) {
					txBytes += fileSet.insert(tr, *it);
					nFileBlocks += (it->fileSize + it->blockSize - 1) / it->blockSize;
					++nFiles;
				}

				restore.fileCount().atomicOp(tr, nFiles, MutationRef::Type::AddValue);
				restore.fileBlockCount().atomicOp(tr, nFileBlocks, MutationRef::Type::AddValue);

				wait(tr->commit());

				TraceEvent("FileRestoreLoadedFiles")
				    .detail("RestoreUID", restore.getUid())
				    .detail("FileCount", nFiles)
				    .detail("FileBlockCount", nFileBlocks)
				    .detail("TransactionBytes", txBytes)
				    .detail("TaskInstance", THIS_ADDR);

				start = it;
				tr->reset();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		return Void();
	}

	ACTOR static Future<Void> _finish(Reference<ReadYourWritesTransaction> tr,
	                                  Reference<TaskBucket> taskBucket,
	                                  Reference<FutureBucket> futureBucket,
	                                  Reference<Task> task) {
		state RestoreConfig restore(task);
		state bool transformPartitionedLog;
		state Version restoreVersion;
		state Version firstVersion = Params.firstVersion().getOrDefault(task, invalidVersion);
		state bool isBlobGranuleRestore;
		wait(store(isBlobGranuleRestore, restore.isBlobGranuleRestore().getD(tr, Snapshot::False, false)));

		if (firstVersion == invalidVersion) {
			if (isBlobGranuleRestore) {
				// For blob granule restore, we can complete the restore job if no mutation log is needed
				state Version beginVersion;
				wait(store(beginVersion, restore.beginVersion().getD(tr, Snapshot::False, ::invalidVersion)));
				wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)));
				// No need to apply mutations if the target version is not greater than the begin version.
				if (restoreVersion <= beginVersion) {
					wait(
					    success(RestoreCompleteTaskFunc::addTask(tr, taskBucket, task, TaskCompletionKey::noSignal())));
					wait(taskBucket->finish(tr, task));
					return Void();
				}
			}

			wait(restore.logError(
			    tr->getDatabase(), restore_missing_data(), "StartFullRestore: The backup had no data.", THIS));
			std::string tag = wait(restore.tag().getD(tr));
			wait(success(abortRestore(tr, StringRef(tag))));
			return Void();
		}

		restore.stateEnum().set(tr, ERestoreState::RUNNING);

		// Set applyMutation versions

		restore.setApplyBeginVersion(tr, firstVersion);
		restore.setApplyEndVersion(tr, firstVersion);

		// Apply range data and log data in order
		wait(store(transformPartitionedLog, restore.transformPartitionedLog().getD(tr, Snapshot::False, false)));
		wait(store(restoreVersion, restore.restoreVersion().getOrThrow(tr)));

		if (transformPartitionedLog) {
			Version endVersion =
			    std::min(firstVersion + CLIENT_KNOBS->RESTORE_PARTITIONED_BATCH_VERSION_SIZE, restoreVersion);
			wait(success(RestoreDispatchPartitionedTaskFunc::addTask(
			    tr, taskBucket, task, firstVersion, firstVersion, endVersion)));
		} else {
			wait(success(RestoreDispatchTaskFunc::addTask(
			    tr, taskBucket, task, 0, "", 0, CLIENT_KNOBS->RESTORE_DISPATCH_BATCH_SIZE)));
		}

		wait(taskBucket->finish(tr, task));

		// Initialize the apply mutations map for the non-blob-restore case. For blob restore, it is
		// initialized in the blob migrator.
		if (!isBlobGranuleRestore) {
			state Future<Optional<bool>> logsOnly = restore.onlyApplyMutationLogs().get(tr);
			wait(success(logsOnly));
			if (logsOnly.get().present() && logsOnly.get().get()) {
				//  If this is an incremental restore, we need to set the applyMutationsMapPrefix
				//  to the earliest log version so no mutations are missed
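				//  Illustrative example (assumed firstVersion): with firstVersion = 100, krmSetRange
				//  maps every key in normalKeys to version 100, so logged mutations from version 100
				//  onward are applied rather than discarded as stale.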
				Value versionEncoded = BinaryWriter::toValue(Params.firstVersion().get(task), Unversioned());
				wait(krmSetRange(tr, restore.applyMutationsMapPrefix(), normalKeys, versionEncoded));
			}
		}
		return Void();
	}

	ACTOR static Future<Key> addTask(Reference<ReadYourWritesTransaction> tr,
	                                 Reference<TaskBucket> taskBucket,
	                                 UID uid,
	                                 TaskCompletionKey completionKey,
	                                 Reference<TaskFuture> waitFor = Reference<TaskFuture>()) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		Key doneKey = wait(completionKey.get(tr, taskBucket));
		state Reference<Task> task(
		    new Task(StartFullRestoreTaskFunc::name, StartFullRestoreTaskFunc::version, doneKey));

		state RestoreConfig restore(uid);
		// Bind the restore config to the new task
		wait(restore.toTask(tr, task));

		if (!waitFor) {
			return taskBucket->addTask(tr, task);
		}

		wait(waitFor->onSetAddTask(tr, taskBucket, task));
		return "OnSetAddTask"_sr;
	}

	StringRef getName() const override { return name; };

	Future<Void> execute(Database cx,
	                     Reference<TaskBucket> tb,
	                     Reference<FutureBucket> fb,
	                     Reference<Task> task) override {
		return _execute(cx, tb, fb, task);
	};
	Future<Void> finish(Reference<ReadYourWritesTransaction> tr,
	                    Reference<TaskBucket> tb,
	                    Reference<FutureBucket> fb,
	                    Reference<Task> task) override {
		return _finish(tr, tb, fb, task);
	};
};
StringRef StartFullRestoreTaskFunc::name = "restore_start"_sr;
REGISTER_TASKFUNC(StartFullRestoreTaskFunc);
} // namespace fileBackup

struct LogInfo : public ReferenceCounted<LogInfo> {
	std::string fileName;
	Reference<IAsyncFile> logFile;
	Version beginVersion;
	Version endVersion;
	int64_t offset;

	LogInfo() : offset(0) {}
};

class FileBackupAgentImpl {
public:
	static constexpr int MAX_RESTORABLE_FILE_METASECTION_BYTES = 1024 * 8;

	// Parallel restore
	ACTOR static Future<Void> parallelRestoreFinish(Database cx, UID randomUID, UnlockDB unlockDB = UnlockDB::True) {
		state ReadYourWritesTransaction tr(cx);
		state Optional<Value> restoreRequestDoneKeyValue;
		TraceEvent("FastRestoreToolWaitForRestoreToFinish").detail("DBLock", randomUID);
		// TODO: register the watch first and then check if the key exists.
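		// Current pattern: read the key; if it is absent, create a watch, commit the transaction to
		// register it, and then wait on it. If the key is already set, the read above breaks out
		// immediately.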
		loop {
			try {
				tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
				Optional<Value> _restoreRequestDoneKeyValue = wait(tr.get(restoreRequestDoneKey));
				restoreRequestDoneKeyValue = _restoreRequestDoneKeyValue;
				// Restore may finish before restoreTool waits on the restore finish event.
				if (restoreRequestDoneKeyValue.present()) {
					break;
				} else {
					state Future<Void> watchForRestoreRequestDone = tr.watch(restoreRequestDoneKey);
					wait(tr.commit());
					wait(watchForRestoreRequestDone);
					break;
				}
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}

		TraceEvent("FastRestoreToolRestoreFinished")
		    .detail("ClearRestoreRequestDoneKey", restoreRequestDoneKeyValue.present());
		// Only this agent can clear the restoreRequestDoneKey
		wait(runRYWTransaction(cx, [](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			tr->clear(restoreRequestDoneKey);
			return Void();
		}));

		if (unlockDB) {
			TraceEvent("FastRestoreToolRestoreFinished").detail("UnlockDBStart", randomUID);
			wait(unlockDatabase(cx, randomUID));
			TraceEvent("FastRestoreToolRestoreFinished").detail("UnlockDBFinish", randomUID);
		} else {
			TraceEvent("FastRestoreToolRestoreFinished").detail("DBLeftLockedAfterRestore", randomUID);
		}

		return Void();
	}

	ACTOR static Future<Void> submitParallelRestore(Database cx,
	                                                Key backupTag,
	                                                Standalone<VectorRef<KeyRangeRef>> backupRanges,
	                                                Key bcUrl,
	                                                Optional<std::string> proxy,
	                                                Version targetVersion,
	                                                LockDB lockDB,
	                                                UID randomUID,
	                                                Key addPrefix,
	                                                Key removePrefix) {
		// Sanity check backup is valid
		state Reference<IBackupContainer> bc = IBackupContainer::openContainer(bcUrl.toString(), proxy, {});
		state BackupDescription desc = wait(bc->describeBackup());
		wait(desc.resolveVersionTimes(cx));

		if (targetVersion == invalidVersion && desc.maxRestorableVersion.present()) {
			targetVersion = desc.maxRestorableVersion.get();
			TraceEvent(SevWarn, "FastRestoreSubmitRestoreRequestWithInvalidTargetVersion")
			    .detail("OverrideTargetVersion", targetVersion);
		}

		Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));

		if (!restoreSet.present()) {
			TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
			    .detail("BackupContainer", bc->getURL())
			    .detail("TargetVersion", targetVersion);
			throw restore_invalid_version();
		}

		TraceEvent("FastRestoreSubmitRestoreRequest")
		    .detail("BackupDesc", desc.toString())
		    .detail("TargetVersion", targetVersion);

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state int restoreIndex = 0;
		state int numTries = 0;
		// lock DB for restore
		loop {
			try {
				if (lockDB) {
					wait(lockDatabase(cx, randomUID));
				}
				wait(checkDatabaseLock(tr, randomUID));

				TraceEvent("FastRestoreToolSubmitRestoreRequests").detail("DBIsLocked", randomUID);
				break;
			} catch (Error& e) {
				TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsMayFail")
				    .error(e)
				    .detail("Reason", "DB is not properly locked")
				    .detail("ExpectedLockID", randomUID);
				numTries++;
				wait(tr->onError(e));
			}
		}

		// set up restore request
		tr->reset();
		numTries = 0;
		loop {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			try {
				// Note: we always lock the DB here in case the DB is modified at the backupRanges
				// boundary.
				for (restoreIndex = 0; restoreIndex < backupRanges.size(); restoreIndex++) {
					auto range = backupRanges[restoreIndex];
					Standalone<StringRef> restoreTag(backupTag.toString() + "_" + std::to_string(restoreIndex));
					// Register the restore request in the DB; it will be picked up by the restore
					// worker leader.
					struct RestoreRequest restoreRequest(restoreIndex,
					                                     restoreTag,
					                                     bcUrl,
					                                     proxy,
					                                     targetVersion,
					                                     range,
					                                     deterministicRandom()->randomUniqueID(),
					                                     addPrefix,
					                                     removePrefix);
					tr->set(restoreRequestKeyFor(restoreRequest.index), restoreRequestValue(restoreRequest));
				}
				tr->set(restoreRequestTriggerKey,
				        restoreRequestTriggerValue(deterministicRandom()->randomUniqueID(), backupRanges.size()));
				wait(tr->commit()); // Trigger restore
				break;
			} catch (Error& e) {
				TraceEvent(numTries > 50 ? SevError : SevInfo, "FastRestoreToolSubmitRestoreRequestsRetry")
				    .error(e)
				    .detail("RestoreIndex", restoreIndex);
				numTries++;
				wait(tr->onError(e));
			}
		}
		return Void();
	}

	// This method returns the final status of the backup on the given tag and, via pContainer and
	// pUID, the backup container and UID that were in use on the tag when that status was read.
	ACTOR static Future<EBackupState> waitBackup(FileBackupAgent* backupAgent,
	                                             Database cx,
	                                             std::string tagName,
	                                             StopWhenDone stopWhenDone,
	                                             Reference<IBackupContainer>* pContainer = nullptr,
	                                             UID* pUID = nullptr) {
		state std::string backTrace;
		state KeyBackedTag tag = makeBackupTag(tagName);

		loop {
			state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);

			try {
				state Optional<UidAndAbortedFlagT> oldUidAndAborted = wait(tag.get(tr));
				if (!oldUidAndAborted.present()) {
					return EBackupState::STATE_NEVERRAN;
				}

				state BackupConfig config(oldUidAndAborted.get().first);
				state EBackupState status =
				    wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));

				// Break, if one of the following is true
				//  - no longer runnable
				//  - in differential mode (restorable) and stopWhenDone is not enabled
				if (!FileBackupAgent::isRunnable(status) ||
				    ((!stopWhenDone) && (EBackupState::STATE_RUNNING_DIFFERENTIAL == status))) {

					if (pContainer != nullptr) {
						Reference<IBackupContainer> c =
						    wait(config.backupContainer().getOrThrow(tr, Snapshot::False, backup_invalid_info()));
						*pContainer = fileBackup::getBackupContainerWithProxy(c);
					}

					if (pUID != nullptr) {
						*pUID = oldUidAndAborted.get().first;
					}

					return status;
				}

				state Future<Void> watchFuture = tr->watch(config.stateEnum().key);
				wait(tr->commit());
				wait(watchFuture);
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}
	}

	// TODO: Get rid of all of these confusing boolean flags
	ACTOR static Future<Void> submitBackup(FileBackupAgent* backupAgent,
	                                       Reference<ReadYourWritesTransaction> tr,
	                                       Key outContainer,
	                                       Optional<std::string> proxy,
	                                       int initialSnapshotIntervalSeconds,
	                                       int snapshotIntervalSeconds,
	                                       std::string tagName,
	                                       Standalone<VectorRef<KeyRangeRef>> backupRanges,
	                                       bool encryptionEnabled,
	                                       StopWhenDone stopWhenDone,
	                                       UsePartitionedLog partitionedLog,
	                                       IncrementalBackupOnly incrementalBackupOnly,
	                                       Optional<std::string> encryptionKeyFileName,
	                                       Optional<std::string> blobManifestUrl) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
		tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);

		TraceEvent(SevInfo, "FBA_SubmitBackup")
		    .detail("TagName", tagName.c_str())
		    .detail("StopWhenDone", stopWhenDone)
		    .detail("UsePartitionedLog", partitionedLog)
		    .detail("OutContainer", outContainer.toString());

		state KeyBackedTag tag = makeBackupTag(tagName);
		Optional<UidAndAbortedFlagT> uidAndAbortedFlag = wait(tag.get(tr));
		if (uidAndAbortedFlag.present()) {
			state BackupConfig prevConfig(uidAndAbortedFlag.get().first);
			state EBackupState prevBackupStatus =
			    wait(prevConfig.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));
			if (FileBackupAgent::isRunnable(prevBackupStatus)) {
				throw backup_duplicate();
			}

			// Now is the time to clear the previous backup config space; we have no more use for it.
			prevConfig.clear(tr);
		}

		state BackupConfig config(deterministicRandom()->randomUniqueID());
		state UID uid = config.getUid();

		// The timestamp comparison below ensures that the current backup starts later than the
		// last backup.
		state Standalone<StringRef> nowStr = BackupAgentBase::getCurrentTime();
		state std::string backupContainer = outContainer.toString();

		// To be consistent with directory handling behavior since FDB backup was first released, if the container
		// string describes a local directory then "/backup-<timestamp>" will be added to it.
		if (backupContainer.find("file://") == 0) {
			backupContainer = joinPath(backupContainer, std::string("backup-") + nowStr.toString());
		}

		state Reference<IBackupContainer> bc =
		    IBackupContainer::openContainer(backupContainer, proxy, encryptionKeyFileName);
		try {
			wait(timeoutError(bc->create(), 30));
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled)
				throw;
			fprintf(stderr, "ERROR: Could not create backup container: %s\n", e.what());
			throw backup_error();
		}

		Optional<Value> lastBackupTimestamp = wait(backupAgent->lastBackupTimestamp().get(tr));

		if ((lastBackupTimestamp.present()) && (lastBackupTimestamp.get() >= nowStr)) {
			fprintf(stderr,
			        "ERROR: The last backup `%s' happened in the future.\n",
			        printable(lastBackupTimestamp.get()).c_str());
			throw backup_error();
		}

		if (blobManifestUrl.present()) {
			state BlobGranuleBackupConfig bgBackupConfig;
			bool enabled = wait(bgBackupConfig.enabled().getD(tr));
			if (enabled) {
				fprintf(stderr, "ERROR: Abort existing blob manifest backup first before creating new one.\n");
				throw backup_error();
			} else {
				bgBackupConfig.manifestUrl().set(tr, blobManifestUrl.get());
				bgBackupConfig.mutationLogsUrl().set(tr, bc->getURL());
				bgBackupConfig.enabled().set(tr, true);
				bgBackupConfig.lastFlushTs().set(tr, 0);
			}
			// Allow only incremental backup
			incrementalBackupOnly = IncrementalBackupOnly::True;
			stopWhenDone = StopWhenDone::False;
		}

		KeyRangeMap<int> backupRangeSet;
		for (auto& backupRange : backupRanges) {
			backupRangeSet.insert(backupRange, 1);
		}

		backupRangeSet.coalesce(allKeys);
		state std::vector<KeyRange> normalizedRanges;

		for (auto& backupRange : backupRangeSet.ranges()) {
			if (backupRange.value()) {
				normalizedRanges.push_back(KeyRange(KeyRangeRef(backupRange.range().begin, backupRange.range().end)));
			}
		}

		config.clear(tr);

		state Key destUidValue(BinaryWriter::toValue(uid, Unversioned()));
		if (normalizedRanges.size() == 1 || isDefaultBackup(normalizedRanges)) {
			RangeResult existingDestUidValues = wait(
			    tr->getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY));
			bool found = false;
			KeyRangeRef targetRange =
			    normalizedRanges.size() == 1 ? normalizedRanges[0] : getDefaultBackupSharedRange();
			for (auto it : existingDestUidValues) {
				KeyRange uidRange =
				    BinaryReader::fromStringRef<KeyRange>(it.key.removePrefix(destUidLookupPrefix), IncludeVersion());
				if (uidRange == targetRange) {
					destUidValue = it.value;
					found = true;
					CODE_PROBE(targetRange == getDefaultBackupSharedRange(),
					           "Backup mutation sharing with default backup");
					break;
				}
			}
			if (!found) {
				destUidValue = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
				tr->set(BinaryWriter::toValue(targetRange, IncludeVersion(ProtocolVersion::withSharedMutations()))
				            .withPrefix(destUidLookupPrefix),
				        destUidValue);
			}
		}

		tr->set(config.getUidAsKey().withPrefix(destUidValue).withPrefix(backupLatestVersionsPrefix),
		        BinaryWriter::toValue<Version>(tr->getReadVersion().get(), Unversioned()));
		config.destUidValue().set(tr, destUidValue);

		// Point the tag to this new uid
		tag.set(tr, { uid, false });

		backupAgent->lastBackupTimestamp().set(tr, nowStr);

		// Set the backup keys
		config.tag().set(tr, tagName);
		config.stateEnum().set(tr, EBackupState::STATE_SUBMITTED);
		config.backupContainer().set(tr, bc);
		config.stopWhenDone().set(tr, stopWhenDone);
		config.backupRanges().set(tr, normalizedRanges);
		config.initialSnapshotIntervalSeconds().set(tr, initialSnapshotIntervalSeconds);
		config.snapshotIntervalSeconds().set(tr, snapshotIntervalSeconds);
		config.partitionedLogEnabled().set(tr, partitionedLog);
		config.incrementalBackupOnly().set(tr, incrementalBackupOnly);
		config.enableSnapshotBackupEncryption().set(tr, encryptionEnabled);
		config.blobBackupEnabled().set(tr, blobManifestUrl.present());

		Key taskKey = wait(fileBackup::StartFullBackupTaskFunc::addTask(
		    tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal()));

		return Void();
	}

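	// Submits the restore request described by the arguments within the given transaction: coalesces
	// the requested ranges, verifies that no runnable restore already exists for the tag (throwing
	// restore_duplicate_uid() or restore_duplicate_tag() otherwise), checks that the destination
	// ranges are empty for non-incremental (and blob manifest) restores, writes the new RestoreConfig,
	// schedules the StartFullRestoreTaskFunc task, and finally locks the database with uid (or checks
	// that it is already locked with uid).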
	ACTOR static Future<Void> submitRestore(FileBackupAgent* backupAgent,
	                                        Reference<ReadYourWritesTransaction> tr,
	                                        Key tagName,
	                                        Key backupURL,
	                                        Optional<std::string> proxy,
	                                        Standalone<VectorRef<KeyRangeRef>> ranges,
	                                        Version restoreVersion,
	                                        Key addPrefix,
	                                        Key removePrefix,
	                                        LockDB lockDB,
	                                        UnlockDB unlockDB,
	                                        OnlyApplyMutationLogs onlyApplyMutationLogs,
	                                        InconsistentSnapshotOnly inconsistentSnapshotOnly,
	                                        Version beginVersion,
	                                        UID uid,
	                                        Optional<std::string> blobManifestUrl,
	                                        TransformPartitionedLog transformPartitionedLog) {
		KeyRangeMap<int> restoreRangeSet;
		for (auto& range : ranges) {
			restoreRangeSet.insert(range, 1);
		}
		restoreRangeSet.coalesce(allKeys);
		state std::vector<KeyRange> restoreRanges;
		for (auto& restoreRange : restoreRangeSet.ranges()) {
			if (restoreRange.value()) {
				restoreRanges.push_back(KeyRange(KeyRangeRef(restoreRange.range().begin, restoreRange.range().end)));
			}
		}
		for (auto& restoreRange : restoreRanges) {
			ASSERT(restoreRange.begin.startsWith(removePrefix) && restoreRange.end.startsWith(removePrefix));
		}

		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		// Get old restore config for this tag
		state KeyBackedTag tag = makeRestoreTag(tagName.toString());
		state Optional<UidAndAbortedFlagT> oldUidAndAborted = wait(tag.get(tr));
		if (oldUidAndAborted.present()) {
			if (oldUidAndAborted.get().first == uid) {
				if (oldUidAndAborted.get().second) {
					throw restore_duplicate_uid();
				} else {
					return Void();
				}
			}

			state RestoreConfig oldRestore(oldUidAndAborted.get().first);

			// Make sure old restore for this tag is not runnable
			bool runnable = wait(oldRestore.isRunnable(tr));

			if (runnable) {
				throw restore_duplicate_tag();
			}

			// Clear the old restore config
			oldRestore.clear(tr);
		}

		if (!onlyApplyMutationLogs || blobManifestUrl.present()) {
			state int index;
			for (index = 0; index < restoreRanges.size(); index++) {
				KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end)
				                                .removePrefix(removePrefix)
				                                .withPrefix(addPrefix);
				RangeResult existingRows = wait(tr->getRange(restoreIntoRange, 1));
				if (existingRows.size() > 0) {
					throw restore_destination_not_empty();
				}
			}
		}
		// Make new restore config
		state RestoreConfig restore(uid);

		// Point the tag to the new uid
		tag.set(tr, { uid, false });

		Reference<IBackupContainer> bc = IBackupContainer::openContainer(backupURL.toString(), proxy, {});

		// Configure the new restore
		restore.tag().set(tr, tagName.toString());
		restore.sourceContainer().set(tr, bc);
		restore.stateEnum().set(tr, ERestoreState::QUEUED);
		restore.restoreVersion().set(tr, restoreVersion);
		restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
		restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
		restore.beginVersion().set(tr, beginVersion);
		restore.unlockDBAfterRestore().set(tr, unlockDB);
		restore.transformPartitionedLog().set(tr, transformPartitionedLog);
		if (BUGGIFY && restoreRanges.size() == 1) {
			restore.restoreRange().set(tr, restoreRanges[0]);
		} else {
			for (auto& range : restoreRanges) {
				restore.restoreRangeSet().insert(tr, range);
			}
		}
		if (blobManifestUrl.present()) {
			state BlobGranuleRestoreConfig bgRestoreConfig;
			BlobRestorePhase phase = wait(bgRestoreConfig.phase().getD(tr, Snapshot::True, BlobRestorePhase::MAX));
			if (phase < BlobRestorePhase::DONE) {
				fprintf(stderr, "ERROR: Abort existing blob granules restore first before creating new one.\n");
				throw restore_error();
			} else {
				bgRestoreConfig.manifestUrl().set(tr, blobManifestUrl.get());
				bgRestoreConfig.mutationLogsUrl().set(tr, backupURL.toString());
				bgRestoreConfig.phase().set(tr, BlobRestorePhase::INIT);
				bgRestoreConfig.uid().set(tr, uid);
				bgRestoreConfig.targetVersion().set(tr, restoreVersion);
				bgRestoreConfig.phaseStartTs().set(tr, BlobRestorePhase::INIT, now());
				restore.isBlobGranuleRestore().set(tr, true);
			}
		}
		// This also sets restore.addPrefix and restore.removePrefix.
		restore.initApplyMutations(tr, addPrefix, removePrefix, onlyApplyMutationLogs);

		Key taskKey = wait(fileBackup::StartFullRestoreTaskFunc::addTask(
		    tr, backupAgent->taskBucket, uid, TaskCompletionKey::noSignal()));

		if (lockDB)
			wait(lockDatabase(tr, uid));
		else
			wait(checkDatabaseLock(tr, uid));

		return Void();
	}

	// This method will return the final status of the restore
	ACTOR static Future<ERestoreState> waitRestore(Database cx, Key tagName, Verbose verbose) {
		state ERestoreState status;
		loop {
			state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
			try {
				tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state KeyBackedTag tag = makeRestoreTag(tagName.toString());
				Optional<UidAndAbortedFlagT> current = wait(tag.get(tr));
				if (!current.present()) {
					if (verbose)
						printf("waitRestore: Tag: %s  State: %s\n",
						       tagName.toString().c_str(),
						       FileBackupAgent::restoreStateText(ERestoreState::UNINITIALIZED).toString().c_str());
					return ERestoreState::UNINITIALIZED;
				}

				state RestoreConfig restore(current.get().first);

				if (verbose) {
					state std::string details = wait(restore.getProgress(tr));
					printf("%s\n", details.c_str());
				}

				ERestoreState status_ = wait(restore.stateEnum().getD(tr));
				status = status_;
				state bool runnable = wait(restore.isRunnable(tr));

				// State won't change from here
				if (!runnable)
					break;

				// Wait for a change
				state Future<Void> watchFuture = tr->watch(restore.stateEnum().key);
				wait(tr->commit());
				if (verbose)
					wait(watchFuture || delay(1));
				else
					wait(watchFuture);
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		return status;
	}

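	// Discontinues the backup on tagName. If the backup is already restorable it is "mostly" aborted
	// and marked COMPLETED (see below); otherwise stopWhenDone is set so the backup stops after the
	// next snapshot completes. Throws backup_unneeded() if the backup is not runnable, and
	// backup_duplicate() if stopWhenDone was already set.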
	ACTOR static Future<Void> discontinueBackup(FileBackupAgent* backupAgent,
	                                            Reference<ReadYourWritesTransaction> tr,
	                                            Key tagName) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		state KeyBackedTag tag = makeBackupTag(tagName.toString());
		state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, Snapshot::False, backup_unneeded()));
		state BackupConfig config(current.first);
		state EBackupState status = wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));

		if (!FileBackupAgent::isRunnable(status)) {
			throw backup_unneeded();
		}

		// If the backup is already restorable then 'mostly' abort it - cancel all tasks via the tag
		// and clear the mutation logging config and data - but set its state as COMPLETED instead of ABORTED.
		state Optional<Version> latestRestorableVersion = wait(config.getLatestRestorableVersion(tr));

		TraceEvent(SevInfo, "FBA_DiscontinueBackup")
		    .detail("AlreadyRestorable", latestRestorableVersion.present() ? "Yes" : "No")
		    .detail("TagName", tag.tagName.c_str())
		    .detail("Status", BackupAgentBase::getStateText(status));

		if (latestRestorableVersion.present()) {
			// Cancel all backup tasks through tag
			wait(tag.cancel(tr));

			tr->setOption(FDBTransactionOptions::COMMIT_ON_FIRST_PROXY);

			state Key destUidValue = wait(config.destUidValue().getOrThrow(tr));
			wait(success(tr->getReadVersion()));
			wait(eraseLogData(tr, config.getUidAsKey(), destUidValue) &&
			     fileBackup::clearBackupStartID(tr, config.getUid()));

			config.stateEnum().set(tr, EBackupState::STATE_COMPLETED);

			return Void();
		}

		state bool stopWhenDone = wait(config.stopWhenDone().getOrThrow(tr));

		if (stopWhenDone) {
			throw backup_duplicate();
		}

		config.stopWhenDone().set(tr, true);

		return Void();
	}

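	// Aborts the backup on tagName within the given transaction: cancels its tasks via the tag,
	// disables any associated blob granule backup, erases its mutation log data, and marks the backup
	// ABORTED. Throws backup_unneeded() if the backup is not runnable.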
	ACTOR static Future<Void> abortBackup(FileBackupAgent* backupAgent,
	                                      Reference<ReadYourWritesTransaction> tr,
	                                      std::string tagName) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);

		state KeyBackedTag tag = makeBackupTag(tagName);
		state UidAndAbortedFlagT current = wait(tag.getOrThrow(tr, Snapshot::False, backup_unneeded()));

		state BackupConfig config(current.first);
		state Key destUidValue = wait(config.destUidValue().getOrThrow(tr));
		EBackupState status = wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));

		if (!backupAgent->isRunnable(status)) {
			throw backup_unneeded();
		}

		TraceEvent(SevInfo, "FBA_AbortBackup")
		    .detail("TagName", tagName.c_str())
		    .detail("Status", BackupAgentBase::getStateText(status));

		bool bgBackupEnabled = wait(config.blobBackupEnabled().getD(tr));
		if (bgBackupEnabled) {
			BlobGranuleBackupConfig bgbackupConfig;
			bgbackupConfig.enabled().set(tr, false);
			bgbackupConfig.lastFlushTs().set(tr, 0);
		}

		// Cancel backup task through tag
		wait(tag.cancel(tr));

		wait(eraseLogData(tr, config.getUidAsKey(), destUidValue) &&
		     fileBackup::clearBackupStartID(tr, config.getUid()));

		config.stateEnum().set(tr, EBackupState::STATE_ABORTED);

		return Void();
	}

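	// Disables backup workers, but only if no partitioned backup is currently running.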
	ACTOR static Future<Void> checkAndDisableBackupWorkers(Database cx) {
		state bool running = wait(runRYWTransaction(
		    cx, [=](Reference<ReadYourWritesTransaction> tr) { return anyPartitionedBackupRunning(tr); }));
		if (!running) {
			wait(disableBackupWorker(cx));
		}
		return Void();
	}

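	// Pauses or resumes all file backup agents and backup workers by setting both the task bucket's
	// pause key and backupPausedKey, retrying until the transaction commits.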
	ACTOR static Future<Void> changePause(FileBackupAgent* backupAgent, Database db, bool pause) {
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(db));

		loop {
			tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

			try {
				// This sets the pause key in the task bucket, which file backup agents check.
				backupAgent->taskBucket->changePause(tr, pause);
				// This sets the backup workers' pause key.
				tr->set(backupPausedKey, pause ? "1"_sr : "0"_sr);
				wait(tr->commit());
				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}
		TraceEvent("FileBackupAgentChangePaused").detail("Action", pause ? "Paused" : "Resumed");
		return Void();
	}

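	// A version paired with the TimeKeeper epoch seconds it maps to (when known), with a JSON
	// representation used by getStatusJSON().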
	struct TimestampedVersion {
		Optional<Version> version;
		Optional<int64_t> epochs;

		bool present() const { return version.present(); }

		JsonBuilderObject toJSON() const {
			JsonBuilderObject doc;
			if (version.present()) {
				doc.setKey("Version", version.get());
				if (epochs.present()) {
					doc.setKey("EpochSeconds", epochs.get());
					doc.setKey("Timestamp", timeStampToString(epochs));
				}
			}
			return doc;
		}
	};

	// Helper actor for generating status.
	// Waits on f; if the resulting version is present, resolves its epoch seconds via TimeKeeper
	// using tr. Returns both as a TimestampedVersion.
	ACTOR static Future<TimestampedVersion> getTimestampedVersion(Reference<ReadYourWritesTransaction> tr,
	                                                              Future<Optional<Version>> f) {
		state TimestampedVersion tv;
		wait(store(tv.version, f));
		if (tv.version.present()) {
			wait(store(tv.epochs, timeKeeperEpochsFromVersion(tv.version.get(), tr)));
		}
		return tv;
	}

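	// Builds a JSON status document for the backup on tagName. A sketch of the output shape, with
	// illustrative values only:
	//
	//   { "SchemaVersion": "1.0.0", "BackupAgentsPaused": false, "Tag": "default", "UID": "...",
	//     "Status": { "Name": "...", "Description": "...", "Completed": false, "Running": true },
	//     "Restorable": true, "LatestRestorablePoint": { "Version": 100, "EpochSeconds": 1700000000,
	//     "Timestamp": "..." }, "DestinationURL": "...", "CurrentSnapshot": { ... }, "Errors": [ ... ] }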
	ACTOR static Future<std::string> getStatusJSON(FileBackupAgent* backupAgent, Database cx, std::string tagName) {
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));

		loop {
			try {
				state JsonBuilderObject doc;
				doc.setKey("SchemaVersion", "1.0.0");

				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state KeyBackedTag tag = makeBackupTag(tagName);
				state Optional<UidAndAbortedFlagT> uidAndAbortedFlag;
				state Optional<Value> paused;
				state Version recentReadVersion;

				wait(store(paused, tr->get(backupAgent->taskBucket->getPauseKey())) &&
				     store(uidAndAbortedFlag, tag.get(tr)) && store(recentReadVersion, tr->getReadVersion()));

				doc.setKey("BackupAgentsPaused", paused.present());
				doc.setKey("Tag", tag.tagName);

				if (uidAndAbortedFlag.present()) {
					doc.setKey("UID", uidAndAbortedFlag.get().first.toString());

					state BackupConfig config(uidAndAbortedFlag.get().first);

					state EBackupState backupState =
					    wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));
					JsonBuilderObject statusDoc;
					statusDoc.setKey("Name", BackupAgentBase::getStateName(backupState));
					statusDoc.setKey("Description", BackupAgentBase::getStateText(backupState));
					statusDoc.setKey("Completed", backupState == EBackupState::STATE_COMPLETED);
					statusDoc.setKey("Running", BackupAgentBase::isRunnable(backupState));
					doc.setKey("Status", statusDoc);

					state Future<Void> done = Void();

					if (backupState != EBackupState::STATE_NEVERRAN) {
						state Reference<IBackupContainer> bc;
						state TimestampedVersion latestRestorable;

						wait(
						    store(latestRestorable, getTimestampedVersion(tr, config.getLatestRestorableVersion(tr))) &&
						    store(bc, config.backupContainer().getOrThrow(tr)));
						bc = fileBackup::getBackupContainerWithProxy(bc);

						doc.setKey("Restorable", latestRestorable.present());

						if (latestRestorable.present()) {
							JsonBuilderObject o = latestRestorable.toJSON();
							if (backupState != EBackupState::STATE_COMPLETED) {
								o.setKey("LagSeconds",
								         (recentReadVersion - latestRestorable.version.get()) /
								             CLIENT_KNOBS->CORE_VERSIONSPERSECOND);
							}
							doc.setKey("LatestRestorablePoint", o);
						}
						doc.setKey("DestinationURL", bc->getURL());
					}

					if (backupState == EBackupState::STATE_RUNNING_DIFFERENTIAL ||
					    backupState == EBackupState::STATE_RUNNING) {
						state int64_t snapshotInterval;
						state int64_t logBytesWritten;
						state int64_t rangeBytesWritten;
						state bool stopWhenDone;
						state TimestampedVersion snapshotBegin;
						state TimestampedVersion snapshotTargetEnd;
						state TimestampedVersion latestLogEnd;
						state TimestampedVersion latestSnapshotEnd;
						state TimestampedVersion snapshotLastDispatch;
						state Optional<int64_t> snapshotLastDispatchShardsBehind;

						wait(
						    store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) &&
						    store(logBytesWritten, config.logBytesWritten().getD(tr)) &&
						    store(rangeBytesWritten, config.rangeBytesWritten().getD(tr)) &&
						    store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)) &&
						    store(snapshotBegin, getTimestampedVersion(tr, config.snapshotBeginVersion().get(tr))) &&
						    store(snapshotTargetEnd,
						          getTimestampedVersion(tr, config.snapshotTargetEndVersion().get(tr))) &&
						    store(latestLogEnd, getTimestampedVersion(tr, config.latestLogEndVersion().get(tr))) &&
						    store(latestSnapshotEnd,
						          getTimestampedVersion(tr, config.latestSnapshotEndVersion().get(tr))) &&
						    store(snapshotLastDispatch,
						          getTimestampedVersion(tr, config.snapshotDispatchLastVersion().get(tr))) &&
						    store(snapshotLastDispatchShardsBehind, config.snapshotDispatchLastShardsBehind().get(tr)));

						doc.setKey("StopAfterSnapshot", stopWhenDone);
						doc.setKey("SnapshotIntervalSeconds", snapshotInterval);
						doc.setKey("LogBytesWritten", logBytesWritten);
						doc.setKey("RangeBytesWritten", rangeBytesWritten);

						if (latestLogEnd.present()) {
							doc.setKey("LatestLogEnd", latestLogEnd.toJSON());
						}

						if (latestSnapshotEnd.present()) {
							doc.setKey("LatestSnapshotEnd", latestSnapshotEnd.toJSON());
						}

						JsonBuilderObject snapshot;

						if (snapshotBegin.present()) {
							snapshot.setKey("Begin", snapshotBegin.toJSON());

							if (snapshotTargetEnd.present()) {
								snapshot.setKey("EndTarget", snapshotTargetEnd.toJSON());

								Version interval = snapshotTargetEnd.version.get() - snapshotBegin.version.get();
								snapshot.setKey("IntervalSeconds", interval / CLIENT_KNOBS->CORE_VERSIONSPERSECOND);

								Version elapsed = recentReadVersion - snapshotBegin.version.get();
								double progress = (interval > 0) ? (100.0 * elapsed / interval) : 100;
								snapshot.setKey("ExpectedProgress", progress);
							}

							JsonBuilderObject dispatchDoc = snapshotLastDispatch.toJSON();
							if (snapshotLastDispatchShardsBehind.present()) {
								dispatchDoc.setKey("ShardsBehind", snapshotLastDispatchShardsBehind.get());
							}
							snapshot.setKey("LastDispatch", dispatchDoc);
						}

						doc.setKey("CurrentSnapshot", snapshot);
					}

					KeyBackedMap<int64_t, std::pair<std::string, Version>>::RangeResultType errors =
					    wait(config.lastErrorPerType().getRange(
					        tr, 0, std::numeric_limits<int>::max(), CLIENT_KNOBS->TOO_MANY));
					JsonBuilderArray errorList;
					for (auto& e : errors.results) {
						std::string msg = e.second.first;
						Version ver = e.second.second;

						JsonBuilderObject errDoc;
						errDoc.setKey("Message", msg.c_str());
						errDoc.setKey("RelativeSeconds",
						              (ver - recentReadVersion) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND);
					}
					doc.setKey("Errors", errorList);
				}

				return doc.getJson();
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}
	}

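	// Builds a human-readable, multi-line status summary for the backup on tagName, including snapshot
	// progress for running backups and, when showErrors is set, recent and older errors grouped
	// relative to the latest restorable point.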
	ACTOR static Future<std::string> getStatus(FileBackupAgent* backupAgent,
	                                           Database cx,
	                                           ShowErrors showErrors,
	                                           std::string tagName) {
		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		state std::string statusText;

		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);

				state KeyBackedTag tag;
				state BackupConfig config;
				state EBackupState backupState;

				statusText = "";
				tag = makeBackupTag(tagName);
				state Optional<UidAndAbortedFlagT> uidAndAbortedFlag = wait(tag.get(tr));
				state Future<Optional<Value>> fPaused = tr->get(backupAgent->taskBucket->getPauseKey());
				if (uidAndAbortedFlag.present()) {
					config = BackupConfig(uidAndAbortedFlag.get().first);
					EBackupState status =
					    wait(config.stateEnum().getD(tr, Snapshot::False, EBackupState::STATE_NEVERRAN));
					backupState = status;
				}

				if (!uidAndAbortedFlag.present() || backupState == EBackupState::STATE_NEVERRAN) {
					statusText += "No previous backups found.\n";
				} else {
					state std::string backupStatus(BackupAgentBase::getStateText(backupState));
					state Reference<IBackupContainer> bc;
					state Optional<Version> latestRestorableVersion;
					state Version recentReadVersion;

					wait(store(latestRestorableVersion, config.getLatestRestorableVersion(tr)) &&
					     store(bc, config.backupContainer().getOrThrow(tr)) &&
					     store(recentReadVersion, tr->getReadVersion()));
					bc = fileBackup::getBackupContainerWithProxy(bc);

					bool snapshotProgress = false;

					switch (backupState) {
					case EBackupState::STATE_SUBMITTED:
						statusText += "The backup on tag `" + tagName + "' is in progress (just started) to " +
						              bc->getURL() + ".\n";
						break;
					case EBackupState::STATE_RUNNING:
						statusText += "The backup on tag `" + tagName + "' is in progress to " + bc->getURL() + ".\n";
						snapshotProgress = true;
						break;
					case EBackupState::STATE_RUNNING_DIFFERENTIAL:
						statusText += "The backup on tag `" + tagName + "' is restorable but continuing to " +
						              bc->getURL() + ".\n";
						snapshotProgress = true;
						break;
					case EBackupState::STATE_COMPLETED:
						statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() +
						              " completed at version " + format("%lld", latestRestorableVersion.orDefault(-1)) +
						              ".\n";
						break;
					default:
						statusText += "The previous backup on tag `" + tagName + "' at " + bc->getURL() + " " +
						              backupStatus + ".\n";
						break;
					}
					statusText += format("BackupUID: %s\n", uidAndAbortedFlag.get().first.toString().c_str());
					statusText += format("BackupURL: %s\n", bc->getURL().c_str());

					if (snapshotProgress) {
						state int64_t snapshotInterval;
						state Version snapshotBeginVersion;
						state Version snapshotTargetEndVersion;
						state Optional<Version> latestSnapshotEndVersion;
						state Optional<Version> latestLogEndVersion;
						state Optional<int64_t> logBytesWritten;
						state Optional<int64_t> rangeBytesWritten;
						state Optional<int64_t> latestSnapshotEndVersionTimestamp;
						state Optional<int64_t> latestLogEndVersionTimestamp;
						state Optional<int64_t> snapshotBeginVersionTimestamp;
						state Optional<int64_t> snapshotTargetEndVersionTimestamp;
						state bool stopWhenDone;

						wait(store(snapshotBeginVersion, config.snapshotBeginVersion().getOrThrow(tr)) &&
						     store(snapshotTargetEndVersion, config.snapshotTargetEndVersion().getOrThrow(tr)) &&
						     store(snapshotInterval, config.snapshotIntervalSeconds().getOrThrow(tr)) &&
						     store(logBytesWritten, config.logBytesWritten().get(tr)) &&
						     store(rangeBytesWritten, config.rangeBytesWritten().get(tr)) &&
						     store(latestLogEndVersion, config.latestLogEndVersion().get(tr)) &&
						     store(latestSnapshotEndVersion, config.latestSnapshotEndVersion().get(tr)) &&
						     store(stopWhenDone, config.stopWhenDone().getOrThrow(tr)));

						wait(store(latestSnapshotEndVersionTimestamp,
						           getTimestampFromVersion(latestSnapshotEndVersion, tr)) &&
						     store(latestLogEndVersionTimestamp, getTimestampFromVersion(latestLogEndVersion, tr)) &&
						     store(snapshotBeginVersionTimestamp,
						           timeKeeperEpochsFromVersion(snapshotBeginVersion, tr)) &&
						     store(snapshotTargetEndVersionTimestamp,
						           timeKeeperEpochsFromVersion(snapshotTargetEndVersion, tr)));

						statusText += format("Snapshot interval is %lld seconds.  ", snapshotInterval);
						if (backupState == EBackupState::STATE_RUNNING_DIFFERENTIAL)
							statusText += format("Current snapshot progress target is %3.2f%% (>100%% means the "
							                     "snapshot is supposed to be done)\n",
							                     100.0 * (recentReadVersion - snapshotBeginVersion) /
							                         (snapshotTargetEndVersion - snapshotBeginVersion));
						else
							statusText += "The initial snapshot is still running.\n";

						statusText += format("\nDetails:\n LogBytes written - %lld\n RangeBytes written - %lld\n "
						                     "Last complete log version and timestamp        - %s, %s\n "
						                     "Last complete snapshot version and timestamp   - %s, %s\n "
						                     "Current Snapshot start version and timestamp   - %s, %s\n "
						                     "Expected snapshot end version and timestamp    - %s, %s\n "
						                     "Backup supposed to stop at next snapshot completion - %s\n",
						                     logBytesWritten.orDefault(0),
						                     rangeBytesWritten.orDefault(0),
						                     versionToString(latestLogEndVersion).c_str(),
						                     timeStampToString(latestLogEndVersionTimestamp).c_str(),
						                     versionToString(latestSnapshotEndVersion).c_str(),
						                     timeStampToString(latestSnapshotEndVersionTimestamp).c_str(),
						                     versionToString(snapshotBeginVersion).c_str(),
						                     timeStampToString(snapshotBeginVersionTimestamp).c_str(),
						                     versionToString(snapshotTargetEndVersion).c_str(),
						                     timeStampToString(snapshotTargetEndVersionTimestamp).c_str(),
						                     boolToYesOrNo(stopWhenDone).c_str());
					}

					// Append the errors, if requested
					if (showErrors) {
						KeyBackedMap<int64_t, std::pair<std::string, Version>>::RangeResultType errors =
						    wait(config.lastErrorPerType().getRange(
						        tr, 0, std::numeric_limits<int>::max(), CLIENT_KNOBS->TOO_MANY));
						std::string recentErrors;
						std::string pastErrors;

						for (auto& e : errors.results) {
							Version v = e.second.second;
							std::string msg = format(
							    "%s ago : %s\n",
							    secondsToTimeFormat((recentReadVersion - v) / CLIENT_KNOBS->CORE_VERSIONSPERSECOND)
							        .c_str(),
							    e.second.first.c_str());

							// If error version is at or more recent than the latest restorable version then it
							// could be inhibiting progress
							if (v >= latestRestorableVersion.orDefault(0)) {
								recentErrors += msg;
							} else {
								pastErrors += msg;
							}
						}

						if (!recentErrors.empty()) {
							if (latestRestorableVersion.present())
								statusText +=
								    format("Recent Errors (since latest restorable point %s ago)\n",
								           secondsToTimeFormat((recentReadVersion - latestRestorableVersion.get()) /
								                               CLIENT_KNOBS->CORE_VERSIONSPERSECOND)
								               .c_str()) +
								    recentErrors;
							else
								statusText += "Recent Errors (since initialization)\n" + recentErrors;
						}
						if (!pastErrors.empty())
							statusText += "Older Errors\n" + pastErrors;
					}
				}

				Optional<Value> paused = wait(fPaused);
				if (paused.present()) {
					statusText += format("\nAll backup agents have been paused.\n");
				}

				break;
			} catch (Error& e) {
				wait(tr->onError(e));
			}
		}

		return statusText;
	}

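	// Reads the last restorable version recorded for tagName, if any.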
	ACTOR static Future<Optional<Version>> getLastRestorable(FileBackupAgent* backupAgent,
	                                                         Reference<ReadYourWritesTransaction> tr,
	                                                         Key tagName,
	                                                         Snapshot snapshot) {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
		state Optional<Value> version = wait(tr->get(backupAgent->lastRestorable.pack(tagName), snapshot));

		return (version.present())
		           ? Optional<Version>(BinaryReader::fromStringRef<Version>(version.get(), Unversioned()))
		           : Optional<Version>();
	}

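	// Consumes and returns the first `bytes` bytes of data, advancing data past them; throws
	// restore_error() if fewer than `bytes` bytes remain. For example, consecutive calls
	// read(data, 4) and read(data, 8) return adjacent, non-overlapping slices.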
	static StringRef read(StringRef& data, int bytes) {
		if (bytes > data.size())
			throw restore_error();
		StringRef r = data.substr(0, bytes);
		data = data.substr(bytes);
		return r;
	}

	// Submits the restore request to the database and throws "restore_invalid_version" error if
	// restore is not possible. Parameters:
	//   cx: the database to be restored to
	//   cxOrig: if present, is used to resolve the restore timestamp into a version.
	//   tagName: restore tag
	//   url: the backup container's URL that contains all backup files
	//   ranges: the restored key ranges; if empty, restore all key ranges in the backup
	//   waitForComplete: if set, wait until the restore is completed before returning; otherwise,
	//                    return when the request is submitted to the database.
	//   targetVersion: the version to be restored.
	//   verbose: print verbose information.
	//   addPrefix: this prefix is added to each key during the restore.
	//   removePrefix: for each key to be restored, this prefix is removed first.
	//   lockDB: if set, lock the database with randomUid before performing the restore;
	//           otherwise, check that the database is already locked with randomUid.
	//   onlyApplyMutationLogs: perform an incremental restore by applying only mutation logs.
	//   inconsistentSnapshotOnly: ignore mutation log files during the restore to speed up the process.
	//                             When set to true this yields an inconsistent snapshot, so it is not
	//                             recommended.
	//   beginVersions: the restore's begin version for each range.
	//   randomUid: the UID used to lock the database.
	ACTOR static Future<Version> restore(FileBackupAgent* backupAgent,
	                                     Database cx,
	                                     Optional<Database> cxOrig,
	                                     Key tagName,
	                                     Key url,
	                                     Optional<std::string> proxy,
	                                     Standalone<VectorRef<KeyRangeRef>> ranges,
	                                     Standalone<VectorRef<Version>> beginVersions,
	                                     WaitForComplete waitForComplete,
	                                     Version targetVersion,
	                                     Verbose verbose,
	                                     Key addPrefix,
	                                     Key removePrefix,
	                                     LockDB lockDB,
	                                     UnlockDB unlockDB,
	                                     OnlyApplyMutationLogs onlyApplyMutationLogs,
	                                     InconsistentSnapshotOnly inconsistentSnapshotOnly,
	                                     Optional<std::string> encryptionKeyFileName,
	                                     UID randomUid,
	                                     Optional<std::string> blobManifestUrl) {
		// The restore command line tool won't allow ranges to be empty, but correctness workloads somehow might.
		if (ranges.empty()) {
			throw restore_error();
		}

		state Reference<IBackupContainer> bc = IBackupContainer::openContainer(url.toString(), proxy, {});

		state BackupDescription desc = wait(bc->describeBackup(true));
		if (cxOrig.present()) {
			wait(desc.resolveVersionTimes(cxOrig.get()));
		}

		printf("Backup Description\n%s", desc.toString().c_str());
		if (targetVersion == invalidVersion && desc.maxRestorableVersion.present())
			targetVersion = desc.maxRestorableVersion.get();

		if (targetVersion == invalidVersion && onlyApplyMutationLogs && desc.contiguousLogEnd.present()) {
			targetVersion = desc.contiguousLogEnd.get() - 1;
		}
		if (blobManifestUrl.present()) {
			onlyApplyMutationLogs = OnlyApplyMutationLogs::True;
		}

		state Version beginVersion = invalidVersion; // min begin version for all ranges
		if (!beginVersions.empty()) {
			beginVersion = *std::min_element(beginVersions.begin(), beginVersions.end());
		}
		Optional<RestorableFileSet> restoreSet =
		    wait(bc->getRestoreSet(targetVersion, ranges, onlyApplyMutationLogs, beginVersion));

		// For blob granule restores we don't have the begin version yet, so there is no need to check
		// the restore set.
		if (!restoreSet.present() && !blobManifestUrl.present()) {
			TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
			    .detail("BackupContainer", bc->getURL())
			    .detail("BeginVersion", beginVersion)
			    .detail("TargetVersion", targetVersion);
			throw restore_invalid_version();
		}

		if (verbose) {
			printf("Restoring backup to version: %lld\n", (long long)targetVersion);
		}

		state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
		loop {
			try {
				tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				wait(submitRestore(backupAgent,
				                   tr,
				                   tagName,
				                   url,
				                   proxy,
				                   ranges,
				                   targetVersion,
				                   addPrefix,
				                   removePrefix,
				                   lockDB,
				                   unlockDB,
				                   onlyApplyMutationLogs,
				                   inconsistentSnapshotOnly,
				                   beginVersion,
				                   randomUid,
				                   blobManifestUrl,
				                   TransformPartitionedLog(desc.partitioned)));
				wait(tr->commit());
				break;
			} catch (Error& e) {
				if (e.code() == error_code_restore_duplicate_tag) {
					throw;
				}
				wait(tr->onError(e));
			}
		}

		if (waitForComplete) {
			ERestoreState finalState = wait(waitRestore(cx, tagName, verbose));
			if (finalState != ERestoreState::COMPLETED)
				throw restore_error();
		}

		return targetVersion;
	}

	// Used for correctness testing only. Locks the database before discontinuing the backup, and that
	// same lock is then used while doing the restore. The tag name of the backup must be the same as
	// that of the restore.
	ACTOR static Future<Version> atomicRestore(FileBackupAgent* backupAgent,
	                                           Database cx,
	                                           Key tagName,
	                                           Standalone<VectorRef<KeyRangeRef>> ranges,
	                                           Key addPrefix,
	                                           Key removePrefix,
	                                           UsePartitionedLog fastRestore) {
		state Reference<ReadYourWritesTransaction> ryw_tr =
		    Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
		state BackupConfig backupConfig;
		state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
		loop {
			try {
				ryw_tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				ryw_tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				state KeyBackedTag tag = makeBackupTag(tagName.toString());
				UidAndAbortedFlagT uidFlag = wait(tag.getOrThrow(ryw_tr));
				backupConfig = BackupConfig(uidFlag.first);
				state EBackupState status = wait(backupConfig.stateEnum().getOrThrow(ryw_tr));

				if (status != EBackupState::STATE_RUNNING_DIFFERENTIAL) {
					throw backup_duplicate();
				}

				break;
			} catch (Error& e) {
				wait(ryw_tr->onError(e));
			}
		}

		// Lock src, record commit version
		state Transaction tr(cx);
		state Version commitVersion;
		state UID randomUid = deterministicRandom()->randomUniqueID();
		loop {
			try {
				// We must get a commit version, so add a conflict range that is unlikely to cause
				// conflicts but ensures the transaction is actually submitted.
				tr.addWriteConflictRange(backupConfig.snapshotRangeDispatchMap().subspace);
				wait(lockDatabase(&tr, randomUid));
				wait(tr.commit());
				commitVersion = tr.getCommittedVersion();
				TraceEvent("AS_Locked").detail("CommitVer", commitVersion);
				break;
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}

		ryw_tr->reset();
		loop {
			try {
				Optional<Version> restoreVersion = wait(backupConfig.getLatestRestorableVersion(ryw_tr));
				if (restoreVersion.present() && restoreVersion.get() >= commitVersion) {
					TraceEvent("AS_RestoreVersion").detail("RestoreVer", restoreVersion.get());
					break;
				} else {
					ryw_tr->reset();
					wait(delay(0.2));
				}
			} catch (Error& e) {
				wait(ryw_tr->onError(e));
			}
		}

		ryw_tr->reset();
		loop {
			try {
				wait(discontinueBackup(backupAgent, ryw_tr, tagName));
				wait(ryw_tr->commit());
				TraceEvent("AS_DiscontinuedBackup").log();
				break;
			} catch (Error& e) {
				if (e.code() == error_code_backup_unneeded || e.code() == error_code_backup_duplicate) {
					break;
				}
				wait(ryw_tr->onError(e));
			}
		}

		wait(success(waitBackup(backupAgent, cx, tagName.toString(), StopWhenDone::True)));
		TraceEvent("AS_BackupStopped").log();

		ryw_tr->reset();
		loop {

			try {
				ryw_tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
				ryw_tr->setOption(FDBTransactionOptions::LOCK_AWARE);
				for (auto& range : ranges) {
					ryw_tr->addReadConflictRange(range);
					ryw_tr->clear(range);
				}
				wait(ryw_tr->commit());
				TraceEvent("AS_ClearedRange").log();
				break;
			} catch (Error& e) {
				wait(ryw_tr->onError(e));
			}
		}

		state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
		bc = fileBackup::getBackupContainerWithProxy(bc);

		if (fastRestore) {
			TraceEvent("AtomicParallelRestoreStartRestore").log();
			Version targetVersion = ::invalidVersion;
			wait(submitParallelRestore(cx,
			                           tagName,
			                           ranges,
			                           KeyRef(bc->getURL()),
			                           bc->getProxy(),
			                           targetVersion,
			                           LockDB::True,
			                           randomUid,
			                           addPrefix,
			                           removePrefix));
			state bool hasPrefix = (addPrefix.size() > 0 || removePrefix.size() > 0);
			TraceEvent("AtomicParallelRestoreWaitForRestoreFinish").detail("HasPrefix", hasPrefix);
			wait(parallelRestoreFinish(cx, randomUid, UnlockDB{ !hasPrefix }));
			// If addPrefix or removePrefix is set, transform the restored data by copying it into place.
			if (hasPrefix) {
				wait(transformRestoredDatabase(cx, ranges, addPrefix, removePrefix));
				wait(unlockDatabase(cx, randomUid));
			}
			return -1;
		} else {
			TraceEvent("AS_StartRestore").log();
			state Standalone<VectorRef<KeyRangeRef>> restoreRange;
			state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
			for (auto r : ranges) {
				if (config.tenantMode != TenantMode::REQUIRED || !r.intersects(getSystemBackupRanges())) {
					restoreRange.push_back_deep(restoreRange.arena(), r);
				} else {
					KeyRangeRef normalKeyRange = r & normalKeys;
					KeyRangeRef systemKeyRange = r & systemKeys;
					if (!normalKeyRange.empty()) {
						restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
					}
					if (!systemKeyRange.empty()) {
						systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
					}
				}
			}
			if (!systemRestoreRange.empty()) {
				// restore system keys
				wait(success(restore(backupAgent,
				                     cx,
				                     cx,
				                     "system_restore"_sr,
				                     KeyRef(bc->getURL()),
				                     bc->getProxy(),
				                     systemRestoreRange,
				                     {},
				                     WaitForComplete::True,
				                     ::invalidVersion,
				                     Verbose::True,
				                     addPrefix,
				                     removePrefix,
				                     LockDB::True,
				                     UnlockDB::False,
				                     OnlyApplyMutationLogs::False,
				                     InconsistentSnapshotOnly::False,
				                     {},
				                     randomUid,
				                     {})));
				state Reference<ReadYourWritesTransaction> rywTransaction =
				    Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
				// clear old restore config associated with system keys
				loop {
					try {
						rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
						rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
						state RestoreConfig oldRestore(randomUid);
						oldRestore.clear(rywTransaction);
						wait(rywTransaction->commit());
						break;
					} catch (Error& e) {
						wait(rywTransaction->onError(e));
					}
				}
			}
			// restore user data
			state Version ver = wait(restore(backupAgent,
			                                 cx,
			                                 cx,
			                                 tagName,
			                                 KeyRef(bc->getURL()),
			                                 bc->getProxy(),
			                                 restoreRange,
			                                 {},
			                                 WaitForComplete::True,
			                                 ::invalidVersion,
			                                 Verbose::True,
			                                 addPrefix,
			                                 removePrefix,
			                                 LockDB::True,
			                                 UnlockDB::True,
			                                 OnlyApplyMutationLogs::False,
			                                 InconsistentSnapshotOnly::False,
			                                 {},
			                                 randomUid,
			                                 {}));
			return ver;
		}
	}

	// Similar to atomicRestore; only used in simulation tests.
	// Locks the database before discontinuing the backup, and that same lock is then used while doing
	// the restore. The tag name of the backup must be the same as that of the restore.
	static Future<Void> atomicParallelRestore(FileBackupAgent* backupAgent,
	                                          Database cx,
	                                          Key tagName,
	                                          Standalone<VectorRef<KeyRangeRef>> ranges,
	                                          Key addPrefix,
	                                          Key removePrefix) {
		return success(
		    atomicRestore(backupAgent, cx, tagName, ranges, addPrefix, removePrefix, UsePartitionedLog::True));
	}
};

const int FileBackupAgent::dataFooterSize = 20;

// Returns when the parallel restore has finished.
Future<Void> FileBackupAgent::parallelRestoreFinish(Database cx, UID randomUID, UnlockDB unlockDB) {
	return FileBackupAgentImpl::parallelRestoreFinish(cx, randomUID, unlockDB);
}

Future<Void> FileBackupAgent::submitParallelRestore(Database cx,
                                                    Key backupTag,
                                                    Standalone<VectorRef<KeyRangeRef>> backupRanges,
                                                    Key bcUrl,
                                                    Optional<std::string> proxy,
                                                    Version targetVersion,
                                                    LockDB lockDB,
                                                    UID randomUID,
                                                    Key addPrefix,
                                                    Key removePrefix) {
	return FileBackupAgentImpl::submitParallelRestore(
	    cx, backupTag, backupRanges, bcUrl, proxy, targetVersion, lockDB, randomUID, addPrefix, removePrefix);
}

Future<Void> FileBackupAgent::atomicParallelRestore(Database cx,
                                                    Key tagName,
                                                    Standalone<VectorRef<KeyRangeRef>> ranges,
                                                    Key addPrefix,
                                                    Key removePrefix) {
	return FileBackupAgentImpl::atomicParallelRestore(this, cx, tagName, ranges, addPrefix, removePrefix);
}

Future<Version> FileBackupAgent::restore(Database cx,
                                         Optional<Database> cxOrig,
                                         Key tagName,
                                         Key url,
                                         Optional<std::string> proxy,
                                         Standalone<VectorRef<KeyRangeRef>> ranges,
                                         Standalone<VectorRef<Version>> versions,
                                         WaitForComplete waitForComplete,
                                         Version targetVersion,
                                         Verbose verbose,
                                         Key addPrefix,
                                         Key removePrefix,
                                         LockDB lockDB,
                                         UnlockDB unlockDB,
                                         OnlyApplyMutationLogs onlyApplyMutationLogs,
                                         InconsistentSnapshotOnly inconsistentSnapshotOnly,
                                         Optional<std::string> const& encryptionKeyFileName,
                                         Optional<std::string> blobManifestUrl) {
	return FileBackupAgentImpl::restore(this,
	                                    cx,
	                                    cxOrig,
	                                    tagName,
	                                    url,
	                                    proxy,
	                                    ranges,
	                                    versions,
	                                    waitForComplete,
	                                    targetVersion,
	                                    verbose,
	                                    addPrefix,
	                                    removePrefix,
	                                    lockDB,
	                                    unlockDB,
	                                    onlyApplyMutationLogs,
	                                    inconsistentSnapshotOnly,
	                                    encryptionKeyFileName,
	                                    deterministicRandom()->randomUniqueID(),
	                                    blobManifestUrl);
}

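// Convenience overload: applies the same beginVersion to every requested range before delegating to
// the vector-of-versions overload above.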
Future<Version> FileBackupAgent::restore(Database cx,
                                         Optional<Database> cxOrig,
                                         Key tagName,
                                         Key url,
                                         Optional<std::string> proxy,
                                         Standalone<VectorRef<KeyRangeRef>> ranges,
                                         WaitForComplete waitForComplete,
                                         Version targetVersion,
                                         Verbose verbose,
                                         Key addPrefix,
                                         Key removePrefix,
                                         LockDB lockDB,
                                         UnlockDB unlockDB,
                                         OnlyApplyMutationLogs onlyApplyMutationLogs,
                                         InconsistentSnapshotOnly inconsistentSnapshotOnly,
                                         Version beginVersion,
                                         Optional<std::string> const& encryptionKeyFileName,
                                         Optional<std::string> blobManifestUrl) {
	Standalone<VectorRef<Version>> beginVersions;
	for (auto i = 0; i < ranges.size(); ++i) {
		beginVersions.push_back(beginVersions.arena(), beginVersion);
	}
	return restore(cx,
	               cxOrig,
	               tagName,
	               url,
	               proxy,
	               ranges,
	               beginVersions,
	               waitForComplete,
	               targetVersion,
	               verbose,
	               addPrefix,
	               removePrefix,
	               lockDB,
	               unlockDB,
	               onlyApplyMutationLogs,
	               inconsistentSnapshotOnly,
	               encryptionKeyFileName,
	               blobManifestUrl);
}

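// Convenience overload: restores a single range (or the default backup ranges when range is empty)
// starting from beginVersion, always unlocking the database afterward (UnlockDB::True).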
Future<Version> FileBackupAgent::restore(Database cx,
                                         Optional<Database> cxOrig,
                                         Key tagName,
                                         Key url,
                                         Optional<std::string> proxy,
                                         WaitForComplete waitForComplete,
                                         Version targetVersion,
                                         Verbose verbose,
                                         KeyRange range,
                                         Key addPrefix,
                                         Key removePrefix,
                                         LockDB lockDB,
                                         OnlyApplyMutationLogs onlyApplyMutationLogs,
                                         InconsistentSnapshotOnly inconsistentSnapshotOnly,
                                         Version beginVersion,
                                         Optional<std::string> const& encryptionKeyFileName,
                                         Optional<std::string> blobManifestUrl) {
	Standalone<VectorRef<KeyRangeRef>> rangeRef;
	if (range.begin.empty() && range.end.empty()) {
		addDefaultBackupRanges(rangeRef);
	} else {
		rangeRef.push_back_deep(rangeRef.arena(), range);
	}
	Standalone<VectorRef<Version>> versionRef;
	versionRef.push_back(versionRef.arena(), beginVersion);

	return restore(cx,
	               cxOrig,
	               tagName,
	               url,
	               proxy,
	               rangeRef,
	               versionRef,
	               waitForComplete,
	               targetVersion,
	               verbose,
	               addPrefix,
	               removePrefix,
	               lockDB,
	               UnlockDB::True,
	               onlyApplyMutationLogs,
	               inconsistentSnapshotOnly,
	               encryptionKeyFileName,
	               blobManifestUrl);
}

Future<Version> FileBackupAgent::atomicRestore(Database cx,
                                               Key tagName,
                                               KeyRange range,
                                               Key addPrefix,
                                               Key removePrefix) {
	Standalone<VectorRef<KeyRangeRef>> rangeRef;
	if (range.begin.empty() && range.end.empty()) {
		addDefaultBackupRanges(rangeRef);
	} else {
		rangeRef.push_back_deep(rangeRef.arena(), range);
	}
	return atomicRestore(cx, tagName, rangeRef, addPrefix, removePrefix);
}

Future<Version> FileBackupAgent::atomicRestore(Database cx,
                                               Key tagName,
                                               Standalone<VectorRef<KeyRangeRef>> ranges,
                                               Key addPrefix,
                                               Key removePrefix) {
	return FileBackupAgentImpl::atomicRestore(
	    this, cx, tagName, ranges, addPrefix, removePrefix, UsePartitionedLog::False);
}

Future<ERestoreState> FileBackupAgent::abortRestore(Reference<ReadYourWritesTransaction> tr, Key tagName) {
	return fileBackup::abortRestore(tr, tagName);
}

Future<ERestoreState> FileBackupAgent::abortRestore(Database cx, Key tagName) {
	return fileBackup::abortRestore(cx, tagName);
}

Future<std::string> FileBackupAgent::restoreStatus(Reference<ReadYourWritesTransaction> tr, Key tagName) {
	return fileBackup::restoreStatus(tr, tagName);
}

Future<ERestoreState> FileBackupAgent::waitRestore(Database cx, Key tagName, Verbose verbose) {
	return FileBackupAgentImpl::waitRestore(cx, tagName, verbose);
}

Future<Void> FileBackupAgent::submitBackup(Reference<ReadYourWritesTransaction> tr,
                                           Key outContainer,
                                           Optional<std::string> proxy,
                                           int initialSnapshotIntervalSeconds,
                                           int snapshotIntervalSeconds,
                                           std::string const& tagName,
                                           Standalone<VectorRef<KeyRangeRef>> backupRanges,
                                           bool encryptionEnabled,
                                           StopWhenDone stopWhenDone,
                                           UsePartitionedLog partitionedLog,
                                           IncrementalBackupOnly incrementalBackupOnly,
                                           Optional<std::string> const& encryptionKeyFileName,
                                           Optional<std::string> const& blobManifestUrl) {
	return FileBackupAgentImpl::submitBackup(this,
	                                         tr,
	                                         outContainer,
	                                         proxy,
	                                         initialSnapshotIntervalSeconds,
	                                         snapshotIntervalSeconds,
	                                         tagName,
	                                         backupRanges,
	                                         encryptionEnabled,
	                                         stopWhenDone,
	                                         partitionedLog,
	                                         incrementalBackupOnly,
	                                         encryptionKeyFileName,
	                                         blobManifestUrl);
}
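
// Illustrative usage of submitBackup (a minimal sketch; the container URL, tag, and
// snapshot intervals are hypothetical, and the trailing options are assumed to take the
// defaults declared in BackupAgent.actor.h):
//   state FileBackupAgent backupAgent;
//   Standalone<VectorRef<KeyRangeRef>> ranges;
//   addDefaultBackupRanges(ranges);
//   wait(backupAgent.submitBackup(
//       tr, "file:///tmp/backup"_sr, {}, 0, 3600, "default", ranges, /*encryptionEnabled=*/false));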

Future<Void> FileBackupAgent::discontinueBackup(Reference<ReadYourWritesTransaction> tr, Key tagName) {
	return FileBackupAgentImpl::discontinueBackup(this, tr, tagName);
}

Future<Void> FileBackupAgent::abortBackup(Reference<ReadYourWritesTransaction> tr, std::string tagName) {
	return FileBackupAgentImpl::abortBackup(this, tr, tagName);
}

Future<Void> FileBackupAgent::checkAndDisableBackupWorkers(Database cx) {
	return FileBackupAgentImpl::checkAndDisableBackupWorkers(cx);
}

Future<std::string> FileBackupAgent::getStatus(Database cx, ShowErrors showErrors, std::string tagName) {
	return FileBackupAgentImpl::getStatus(this, cx, showErrors, tagName);
}

Future<std::string> FileBackupAgent::getStatusJSON(Database cx, std::string tagName) {
	return FileBackupAgentImpl::getStatusJSON(this, cx, tagName);
}

Future<Optional<Version>> FileBackupAgent::getLastRestorable(Reference<ReadYourWritesTransaction> tr,
                                                             Key tagName,
                                                             Snapshot snapshot) {
	return FileBackupAgentImpl::getLastRestorable(this, tr, tagName, snapshot);
}

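// Directly records the last-restorable version for tagName in the agent's lastRestorable
// map; the function itself sets ACCESS_SYSTEM_KEYS and LOCK_AWARE on the transaction.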
void FileBackupAgent::setLastRestorable(Reference<ReadYourWritesTransaction> tr, Key tagName, Version version) {
	tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
	tr->setOption(FDBTransactionOptions::LOCK_AWARE);
	tr->set(lastRestorable.pack(tagName), BinaryWriter::toValue<Version>(version, Unversioned()));
}

Future<EBackupState> FileBackupAgent::waitBackup(Database cx,
                                                 std::string tagName,
                                                 StopWhenDone stopWhenDone,
                                                 Reference<IBackupContainer>* pContainer,
                                                 UID* pUID) {
	return FileBackupAgentImpl::waitBackup(this, cx, tagName, stopWhenDone, pContainer, pUID);
}
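
// Illustrative polling pattern for waitBackup (a sketch; assumes a backup already running
// on the hypothetical tag "default"):
//   Reference<IBackupContainer> container;
//   UID uid;
//   EBackupState status =
//       wait(backupAgent.waitBackup(cx, "default", StopWhenDone::False, &container, &uid));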

Future<Void> FileBackupAgent::changePause(Database db, bool pause) {
	return FileBackupAgentImpl::changePause(this, db, pause);
}

// Fast Restore addPrefix test helper functions
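// Returns the pair (insideBackupRange, insideRestoreRange) for kv.key. Note that the pair
// order is (backup, restore) even though the parameters are passed (restore, backup);
// callers read .first as the backup-range check.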
static std::pair<bool, bool> insideValidRange(KeyValueRef kv,
                                              Standalone<VectorRef<KeyRangeRef>> restoreRanges,
                                              Standalone<VectorRef<KeyRangeRef>> backupRanges) {
	bool insideRestoreRange = false;
	bool insideBackupRange = false;
	for (auto& range : restoreRanges) {
		TraceEvent(SevFRTestInfo, "InsideValidRestoreRange")
		    .detail("Key", kv.key)
		    .detail("Range", range)
		    .detail("Inside", (kv.key >= range.begin && kv.key < range.end));
		if (kv.key >= range.begin && kv.key < range.end) {
			insideRestoreRange = true;
			break;
		}
	}
	for (auto& range : backupRanges) {
		TraceEvent(SevFRTestInfo, "InsideValidBackupRange")
		    .detail("Key", kv.key)
		    .detail("Range", range)
		    .detail("Inside", (kv.key >= range.begin && kv.key < range.end));
		if (kv.key >= range.begin && kv.key < range.end) {
			insideBackupRange = true;
			break;
		}
	}
	return std::make_pair(insideBackupRange, insideRestoreRange);
}

// Write kvs[begin, end) to the DB, then read the range back in a separate transaction as a
// sanity check that the writes are visible.
ACTOR static Future<Void> writeKVs(Database cx, Standalone<VectorRef<KeyValueRef>> kvs, int begin, int end) {
	wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
		int index = begin;
		while (index < end) {
			TraceEvent(SevFRTestInfo, "TransformDatabaseContentsWriteKV")
			    .detail("Index", index)
			    .detail("KVs", kvs.size())
			    .detail("Key", kvs[index].key)
			    .detail("Value", kvs[index].value);
			tr->set(kvs[index].key, kvs[index].value);
			++index;
		}
		return Void();
	}));

	// Sanity check data has been written to DB
	state ReadYourWritesTransaction tr(cx);
	loop {
		try {
			tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
			KeyRef k1 = kvs[begin].key;
			KeyRef k2 = end < kvs.size() ? kvs[end].key : allKeys.end;
			TraceEvent(SevFRTestInfo, "TransformDatabaseContentsWriteKVReadBack")
			    .detail("Range", KeyRangeRef(k1, k2))
			    .detail("Begin", begin)
			    .detail("End", end);
			RangeResult readKVs = wait(tr.getRange(KeyRangeRef(k1, k2), CLIENT_KNOBS->TOO_MANY));
			ASSERT(readKVs.size() > 0 || begin == end);
			break;
		} catch (Error& e) {
			TraceEvent("TransformDatabaseContentsWriteKVReadBackError").error(e);
			wait(tr.onError(e));
		}
	}

	TraceEvent(SevFRTestInfo, "TransformDatabaseContentsWriteKVDone").detail("Begin", begin).detail("End", end);

	return Void();
}

// restoreRanges are the actual ranges produced by the restore system, i.e. the backup
// ranges after removePrefix has been stripped and addPrefix applied.
// Assumes restoreRanges do not overlap, which is guaranteed by requiring that the backup
// ranges do not overlap.
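// Steps: (1) read all data in restoreRanges; (2) strip removePrefix and apply addPrefix to
// each key in memory; (3) clear both the source and destination ranges; (4) verify the
// normal keyspace is empty; (5) write the transformed KVs back in batches of 100;
// (6) verify that every surviving key falls inside an expected range.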
ACTOR static Future<Void> transformDatabaseContents(Database cx,
                                                    Key addPrefix,
                                                    Key removePrefix,
                                                    Standalone<VectorRef<KeyRangeRef>> restoreRanges) {
	state ReadYourWritesTransaction tr(cx);
	state Standalone<VectorRef<KeyValueRef>> oldData;

	TraceEvent("FastRestoreWorkloadTransformDatabaseContents")
	    .detail("AddPrefix", addPrefix)
	    .detail("RemovePrefix", removePrefix);
	state int i = 0;
	loop { // Read all data from DB
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			for (i = 0; i < restoreRanges.size(); ++i) {
				RangeResult kvs = wait(tr.getRange(restoreRanges[i], CLIENT_KNOBS->TOO_MANY));
				ASSERT(!kvs.more);
				for (auto kv : kvs) {
					oldData.push_back_deep(oldData.arena(), KeyValueRef(kv.key, kv.value));
				}
			}
			break;
		} catch (Error& e) {
			TraceEvent("FastRestoreWorkloadTransformDatabaseContentsGetAllKeys")
			    .error(e)
			    .detail("Index", i)
			    .detail("RestoreRange", restoreRanges[i]);
			oldData = Standalone<VectorRef<KeyValueRef>>(); // clear the vector
			wait(tr.onError(e));
		}
	}

	// Transform the data in memory: strip removePrefix from each key, then apply addPrefix
	state Standalone<VectorRef<KeyValueRef>> newKVs;
	for (int i = 0; i < oldData.size(); ++i) {
		Key newKey(oldData[i].key);
		TraceEvent(SevFRTestInfo, "TransformDatabaseContents")
		    .detail("Keys", oldData.size())
		    .detail("Index", i)
		    .detail("GetKey", oldData[i].key)
		    .detail("GetValue", oldData[i].value);
		if (newKey.size() < removePrefix.size()) { // Unexpected: key is shorter than removePrefix; log and skip.
			TraceEvent(SevError, "TransformDatabaseContents")
			    .detail("Key", newKey)
			    .detail("RemovePrefix", removePrefix);
			continue;
		}
		newKey = newKey.removePrefix(removePrefix).withPrefix(addPrefix);
		newKVs.push_back_deep(newKVs.arena(), KeyValueRef(newKey.contents(), oldData[i].value));
		TraceEvent(SevFRTestInfo, "TransformDatabaseContents")
		    .detail("Keys", newKVs.size())
		    .detail("Index", i)
		    .detail("NewKey", newKVs.back().key)
		    .detail("NewValue", newKVs.back().value);
	}

	state Standalone<VectorRef<KeyRangeRef>> backupRanges; // destination ranges for the transformed data
	for (auto& range : restoreRanges) {
		KeyRange tmpRange = range;
		backupRanges.push_back_deep(backupRanges.arena(), tmpRange.removePrefix(removePrefix).withPrefix(addPrefix));
	}

	// Clear both the source data in restoreRanges and any existing data in the destination backupRanges
	wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
		tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
		for (int i = 0; i < restoreRanges.size(); i++) {
			TraceEvent(SevFRTestInfo, "TransformDatabaseContents")
			    .detail("ClearRestoreRange", restoreRanges[i])
			    .detail("ClearBackupRange", backupRanges[i]);
			tr->clear(restoreRanges[i]); // source: where the restored (prefix-transformed) data currently lives
			tr->clear(backupRanges[i]); // destination of the transform
		}
		return Void();
	}));

	// Sanity check: the entire normal keyspace should now be empty; any surviving key is logged at SevError
	tr.reset();
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			RangeResult emptyData = wait(tr.getRange(normalKeys, CLIENT_KNOBS->TOO_MANY));
			for (int i = 0; i < emptyData.size(); ++i) {
				TraceEvent(SevError, "ExpectEmptyData")
				    .detail("Index", i)
				    .detail("Key", emptyData[i].key)
				    .detail("Value", emptyData[i].value);
			}
			break;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}

	// Write the transformed KVs (i.e., the KVs as the backup originally recorded them) back to the DB
	state std::vector<Future<Void>> fwrites;
	loop {
		try {
			state int begin = 0;
			state int len = 0;
			while (begin < newKVs.size()) {
				len = std::min(100, newKVs.size() - begin);
				fwrites.push_back(writeKVs(cx, newKVs, begin, begin + len));
				begin = begin + len;
			}
			wait(waitForAll(fwrites));
			break;
		} catch (Error& e) {
			TraceEvent(SevError, "FastRestoreWorkloadTransformDatabaseContentsUnexpectedErrorOnWriteKVs").error(e);
			wait(tr.onError(e));
		}
	}

	// Sanity check: every key now in the DB must fall inside one of the expected ranges
	tr.reset();
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			RangeResult allData = wait(tr.getRange(normalKeys, CLIENT_KNOBS->TOO_MANY));
			TraceEvent(SevFRTestInfo, "SanityCheckData").detail("Size", allData.size());
			for (int i = 0; i < allData.size(); ++i) {
				std::pair<bool, bool> backupRestoreValid = insideValidRange(allData[i], restoreRanges, backupRanges);
				TraceEvent(backupRestoreValid.first ? SevFRTestInfo : SevError, "SanityCheckData")
				    .detail("Index", i)
				    .detail("Key", allData[i].key)
				    .detail("Value", allData[i].value)
				    .detail("InsideBackupRange", backupRestoreValid.first)
				    .detail("InsideRestoreRange", backupRestoreValid.second);
			}
			break;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}

	TraceEvent("FastRestoreWorkloadTransformDatabaseContentsFinish")
	    .detail("AddPrefix", addPrefix)
	    .detail("RemovePrefix", removePrefix);

	return Void();
}

// addPrefix and removePrefix are the options used in the restore request: during restore,
// every backup key had removePrefix stripped and addPrefix applied. This actor reverts
// that transform by removing addPrefix and re-adding removePrefix.
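//
// Worked example (prefixes are hypothetical, for illustration only): with
// removePrefix = "old/" and addPrefix = "new/", restore mapped the backup key "old/k" to
// "new/k"; this actor maps "new/k" back to "old/k" by calling transformDatabaseContents
// with the two prefixes swapped.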
ACTOR Future<Void> transformRestoredDatabase(Database cx,
                                             Standalone<VectorRef<KeyRangeRef>> backupRanges,
                                             Key addPrefix,
                                             Key removePrefix) {
	try {
		Standalone<VectorRef<KeyRangeRef>> restoreRanges;
		for (int i = 0; i < backupRanges.size(); ++i) {
			KeyRange range(backupRanges[i]);
			Key begin = range.begin.removePrefix(removePrefix).withPrefix(addPrefix);
			Key end = range.end.removePrefix(removePrefix).withPrefix(addPrefix);
			TraceEvent("FastRestoreTransformRestoredDatabase")
			    .detail("From", KeyRangeRef(begin.contents(), end.contents()))
			    .detail("To", range);
			restoreRanges.push_back_deep(restoreRanges.arena(), KeyRangeRef(begin.contents(), end.contents()));
		}
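		// Note: the prefixes are intentionally passed in swapped positions so that
		// transformDatabaseContents reverts the restore's transform (its removePrefix is
		// this actor's addPrefix and vice versa).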
		wait(transformDatabaseContents(cx, removePrefix, addPrefix, restoreRanges));
	} catch (Error& e) {
		TraceEvent(SevError, "FastRestoreTransformRestoredDatabaseUnexpectedError").error(e);
		throw;
	}

	return Void();
}

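// Under BUGGIFY, roughly 1% of calls throw a simulated blob-store error, chosen as
// http_request_failed (50%), connection_failed (20%), timed_out (10%), or lookup_failed
// (10%); the remaining 10% of sampled calls throw nothing.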
void simulateBlobFailure() {
	if (BUGGIFY && deterministicRandom()->random01() < 0.01) { // Simulate blob failures
		double i = deterministicRandom()->random01();
		if (i < 0.5) {
			throw http_request_failed();
		} else if (i < 0.7) {
			throw connection_failed();
		} else if (i < 0.8) {
			throw timed_out();
		} else if (i < 0.9) {
			throw lookup_failed();
		}
	}
}
